/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsin.S"

#include "libm.h"

	RO_DATA
	.align	64
! Constant pool.  Each line holds two 32-bit words (8 bytes); the code
! fetches entries with ldd (or single words with ld) at the byte offsets
! given by the #defines that follow the table.  The trailing comment on
! each line gives that offset and name.
constants:
	.word	0x3ec718e3,0xa6972785	! 0x00 p4
	.word	0x3ef9fd39,0x94293940	! 0x08 q4
	.word	0xbf2a019f,0x75ee4be1	! 0x10 p3
	.word	0xbf56c16b,0xba552569	! 0x18 q3
	.word	0x3f811111,0x1108c703	! 0x20 p2
	.word	0x3fa55555,0x554f5b35	! 0x28 q2
	.word	0xbfc55555,0x555554d0	! 0x30 p1
	.word	0xbfdfffff,0xffffff85	! 0x38 q1
	.word	0x3ff00000,0x00000000	! 0x40 one
	.word	0xbfc55555,0x5551fc28	! 0x48 pp1
	.word	0x3f811107,0x62eacc9d	! 0x50 pp2
	.word	0xbfdfffff,0xffff6328	! 0x58 qq1
	.word	0x3fa55551,0x5f7acf0c	! 0x60 qq2
	.word	0x3fe45f30,0x6dc9c883	! 0x68 invpio2
	.word	0x43380000,0x00000000	! 0x70 round
	.word	0x3ff921fb,0x54400000	! 0x78 pio2_1
	.word	0x3dd0b461,0x1a600000	! 0x80 pio2_2
	.word	0x3ba3198a,0x2e000000	! 0x88 pio2_3
	.word	0x397b839a,0x252049c1	! 0x90 pio2_3t
	.word	0x80000000,0x00004000	! 0x98 f30val: words for %f30 and %f31
! 0xa0 is mask; 0xa8 and 0xb0 are the two thresh entries.  The medium
! range code also reads individual words of these three lines by address
! arithmetic relative to thresh.
	.word	0xffff8000,0x00000000	! N.B.: low-order words used
	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
	.word	0x3fc40000,0x00000000	! references to "thresh" below

! Byte offsets of the entries in the constants table.  The p*/q* entries
! are laid out alternately, 8 bytes apart, so that adding 0 or 8 to a p*
! offset selects the matching q* entry.
#define p4		0x0
#define q4		0x08
#define p3		0x10
#define q3		0x18
#define p2		0x20
#define q2		0x28
#define p1		0x30
#define q1		0x38
#define one		0x40
#define pp1		0x48
#define pp2		0x50
#define qq1		0x58
#define qq2		0x60
#define invpio2		0x68
#define round		0x70
#define pio2_1		0x78
#define pio2_2		0x80
#define pio2_3		0x88
#define pio2_3t		0x90
#define f30val		0x98
#define mask		0xa0
#define thresh		0xa8

! local storage indices
! These are offsets from %fp into the temporary area that the save
! instruction at function entry reserves below the caller frame.
! The x?_1 and y?_0 slots are 16 bytes apart; the medium range code
! uses both the slot itself and the 8 bytes following it.

#define xsave		STACK_BIAS-0x8
#define ysave		STACK_BIAS-0x10
#define nsave		STACK_BIAS-0x14
#define sxsave		STACK_BIAS-0x18
#define sysave		STACK_BIAS-0x1c
#define biguns		STACK_BIAS-0x20
#define n2		STACK_BIAS-0x24
#define n1		STACK_BIAS-0x28
#define n0		STACK_BIAS-0x2c
#define x2_1		STACK_BIAS-0x40
#define x1_1		STACK_BIAS-0x50
#define x0_1		STACK_BIAS-0x60
#define y2_0		STACK_BIAS-0x70
#define y1_0		STACK_BIAS-0x80
#define y0_0		STACK_BIAS-0x90
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x90

!--------------------------------------------------------------
!	Some defines to keep code more readable
#define LIM_l6		%l6
!	in primary range, contains the high-word limit (0x3e400000) on |x|
!	below which the result is x itself (see .range0/.range1/.range2).
!	in transferring to medium range, denotes what loop was active.
!--------------------------------------------------------------

	ENTRY(__vsin)
! Vector sine entry point and primary-range setup.
!
! Per the register-use table below, the incoming arguments are
! %i0 = n (element count), %i1 = x, %i2 = stridex, %i3 = y and
! %i4 = stridey.  Both strides are shifted left by 3 below, i.e. they
! are counted in 8-byte elements.
!
! This section saves the arguments on the stack, loads the polynomial
! constants into %f30-%f62, fetches the first argument and enters the
! three-way unrolled loop at .loop0.
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(g5)
	PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
	PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
	PIC_SET(g5,constants,l5)
! keep the constants base in %g1 for the ldd sequence below; %l5 is
! overwritten with 0x3fc90000 before those loads are issued
	mov	%l5,%g1
	wr	%g0,0x82,%asi		! set %asi for non-faulting loads

! ========== primary range ==========

! register use

! i0  n
! i1  x
! i2  stridex
! i3  y
! i4  stridey
! i5  0x80000000

! l0  hx0
! l1  hx1
! l2  hx2
! l3  __vlibm_TBL_sincos_hi
! l4  __vlibm_TBL_sincos_lo
! l5  0x3fc90000
! l6  0x3e400000
! l7  0x3fe921fb

! the following are 64-bit registers in both V8+ and V9

! g1  scratch
! g5

! o0  py0
! o1  py1
! o2  py2
! o3  oy0
! o4  oy1
! o5  oy2
! o7  scratch

! f0  x0
! f2
! f4
! f6
! f8  scratch for table base
! f9  signbit0
! f10 x1
! f12
! f14
! f16
! f18 scratch for table base
! f19 signbit1
! f20 x2
! f22
! f24
! f26
! f28 scratch for table base
! f29 signbit2
! f30 0x80000000
! f31 0x4000
! f32
! f34
! f36
! f38
! f40
! f42
! f44 0xffff800000000000
! f46 p1
! f48 p2
! f50 p3
! f52 p4
! f54 one
! f56 pp1
! f58 pp2
! f60 qq1
! f62 qq2

#ifdef __sparcv9
	stx	%i1,[%fp+xsave]		! save arguments
	stx	%i3,[%fp+ysave]
#else
	st	%i1,[%fp+xsave]		! save arguments
	st	%i3,[%fp+ysave]
#endif
	st	%i0,[%fp+nsave]
	st	%i2,[%fp+sxsave]
	st	%i4,[%fp+sysave]
	sethi	%hi(0x80000000),%i5	! load/set up constants
	sethi	%hi(0x3fc90000),%l5
	sethi	%hi(0x3e400000),LIM_l6
	sethi	%hi(0x3fe921fb),%l7
	or	%l7,%lo(0x3fe921fb),%l7
	ldd	[%g1+f30val],%f30
	ldd	[%g1+mask],%f44
	ldd	[%g1+p1],%f46
	ldd	[%g1+p2],%f48
	ldd	[%g1+p3],%f50
	ldd	[%g1+p4],%f52
	ldd	[%g1+one],%f54
	ldd	[%g1+pp1],%f56
	ldd	[%g1+pp2],%f58
	ldd	[%g1+qq1],%f60
	ldd	[%g1+qq2],%f62
	sll	%i2,3,%i2		! scale strides
	sll	%i4,3,%i4
! The loop stores the results of the previous triple through %o3/%o4/%o5
! before any result exists, so point all three at a scratch stack slot
! for the first pass.
	add	%fp,x0_1,%o3		! precondition loop
	add	%fp,x0_1,%o4
	add	%fp,x0_1,%o5
! fetch the first argument: high word into %l0, full double into %f0/%f1
	ld	[%i1],%l0		! hx = *x
	ld	[%i1],%f0
	ld	[%i1+4],%f1
	andn	%l0,%i5,%l0		! hx &= ~0x80000000
	add	%i1,%i2,%i1		! x += stridex

	ba,pt	%icc,.loop0
! delay slot
	nop

	.align 32
! Primary-range loop, unrolled three ways and software pipelined.
!
! .loop0/.loop1/.loop2 each handle one argument (x0/x1/x2): they test
! its sign-cleared high word against the primary-range limits, save its
! sign bit, record its output pointer and, in .loop0 and .loop1,
! preload the next argument with non-faulting loads.  An argument
! with hx < 0x3e400000 or hx > 0x3fe921fb leaves through .range0,
! .range1 or .range2.  When the element count runs out after x0 or x1
! the code leaves through .endloop1 or .endloop2.
!
! The results of the previous triple (%f6, %f16, %f26) are stored
! through %o3/%o4/%o5 at the start of .loop2 and in the branch delay
! slots that follow, before those registers are reused.
!
! After .loop2 each argument is compared with 0x3fc90000 (%l5).  If any
! one is below it the code branches to one of .case1 - .case7; the
! fall-through path evaluates all three via the lookup tables.
.loop0:
	lda	[%i1]%asi,%l1		! preload next argument
! %g1 < 0 if hx0 < 0x3e400000, %o7 < 0 if hx0 > 0x3fe921fb; the orcc
! below sets the negative flag if either holds
	sub	%l0,LIM_l6,%g1
	sub	%l7,%l0,%o7
	fands	%f0,%f30,%f9		! save signbit

	lda	[%i1]%asi,%f10
	orcc	%o7,%g1,%g0
	mov	%i3,%o0			! py0 = y
	bl,pn	%icc,.range0		! if hx < 0x3e400000 or > 0x3fe921fb

! delay slot
	lda	[%i1+4]%asi,%f11
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! y += stridey
	ble,pn	%icc,.endloop1

! delay slot
	andn	%l1,%i5,%l1
	add	%i1,%i2,%i1		! x += stridex
	fabsd	%f0,%f0
	fmuld	%f54,%f54,%f54		! one*one; a nop for alignment only

.loop1:
	lda	[%i1]%asi,%l2		! preload next argument
	sub	%l1,LIM_l6,%g1
	sub	%l7,%l1,%o7
	fands	%f10,%f30,%f19		! save signbit

	lda	[%i1]%asi,%f20
	orcc	%o7,%g1,%g0
	mov	%i3,%o1			! py1 = y
	bl,pn	%icc,.range1		! if hx < 0x3e400000 or > 0x3fe921fb

! delay slot
	lda	[%i1+4]%asi,%f21
	addcc	%i0,-1,%i0
	add	%i3,%i4,%i3		! y += stridey
	ble,pn	%icc,.endloop2

! delay slot
	andn	%l2,%i5,%l2
	add	%i1,%i2,%i1		! x += stridex
	fabsd	%f10,%f10
	fmuld	%f54,%f54,%f54		! one*one; a nop for alignment only

.loop2:
! start flushing the previous triple: %f6/%f7 go to the old y0 slot
	st	%f6,[%o3]
	sub	%l2,LIM_l6,%g1
	sub	%l7,%l2,%o7
	fands	%f20,%f30,%f29		! save signbit

	st	%f7,[%o3+4]
	orcc	%g1,%o7,%g0
	mov	%i3,%o2			! py2 = y
	bl,pn	%icc,.range2		! if hx < 0x3e400000 or > 0x3fe921fb

! delay slot
	add	%i3,%i4,%i3		! y += stridey
! hx0 < 0x3fc90000: x0 takes the polynomial-only path (.case4 - .case7)
	cmp	%l0,%l5
	fabsd	%f20,%f20
	bl,pn	%icc,.case4

! delay slot
	st	%f16,[%o4]
! hx1 < 0x3fc90000 (and x0 is not): .case2 or .case3
	cmp	%l1,%l5
	fpadd32s %f0,%f31,%f8
	bl,pn	%icc,.case2

! delay slot
	st	%f17,[%o4+4]
! only hx2 < 0x3fc90000: .case1
	cmp	%l2,%l5
	fpadd32s %f10,%f31,%f18
	bl,pn	%icc,.case1

! delay slot
	st	%f26,[%o5]
! All three arguments are at or above the threshold: table-driven path.
! %o3/%o4/%o5 are now free and take the output pointers of this triple.
	mov	%o0,%o3
	sethi	%hi(0x3fc3c000),%o7
	fpadd32s %f20,%f31,%f28

	st	%f27,[%o5+4]
! %f8/%f18/%f28 hold the high word of |x| plus 0x4000; masking with
! 0xffff800000000000 leaves a rounded breakpoint in %f2/%f12/%f22
	fand	%f8,%f44,%f2
	mov	%o1,%o4

	fand	%f18,%f44,%f12
	mov	%o2,%o5
	sub	%l0,%o7,%l0

	fand	%f28,%f44,%f22
	sub	%l1,%o7,%l1
	sub	%l2,%o7,%l2

! x -= breakpoint; meanwhile turn (hx - 0x3fc3c000) into a table byte
! offset that is a multiple of 32
	fsubd	%f0,%f2,%f0
	srl	%l0,10,%l0
	add	%l3,8,%g1

	fsubd	%f10,%f12,%f10
	srl	%l1,10,%l1

	fsubd	%f20,%f22,%f20
	srl	%l2,10,%l2

! z = x*x
	fmuld	%f0,%f0,%f2
	andn	%l0,0x1f,%l0

	fmuld	%f10,%f10,%f12
	andn	%l1,0x1f,%l1

	fmuld	%f20,%f20,%f22
	andn	%l2,0x1f,%l2

! %f6/%f16/%f26 accumulate x*(one + z*(pp1 + z*pp2)) and
! %f4/%f14/%f24 accumulate z*(qq1 + z*qq2); the hi-table entries at
! offsets 0 and 8 are loaded alongside
	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f36

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f40

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4
	ldd	[%g1+%l0],%f34

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14
	ldd	[%g1+%l1],%f38

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24
	ldd	[%g1+%l2],%f42

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f2

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f12

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f22

! combine with the table values: result = T0 + (lo + (T8*xpoly + T0*zpoly))
! where T0/T8 are the hi-table entries at offsets 0 and 8 and lo is the
! lo-table entry (presumably sin and cos of the breakpoint plus a
! low-order correction; the tables are defined elsewhere)
	fmuld	%f4,%f32,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f14,%f36,%f14
	lda	[%i1]%asi,%f0

	fmuld	%f24,%f40,%f24
	lda	[%i1+4]%asi,%f1

	fmuld	%f6,%f34,%f6
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f16,%f38,%f16

	fmuld	%f26,%f42,%f26

	faddd	%f6,%f4,%f6

	faddd	%f16,%f14,%f16

	faddd	%f26,%f24,%f26

	faddd	%f6,%f2,%f6

	faddd	%f16,%f12,%f16

	faddd	%f26,%f22,%f26

	faddd	%f6,%f32,%f6

	faddd	%f16,%f36,%f16

	faddd	%f26,%f40,%f26
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

! reattach the saved sign bits; loop again while elements remain
	fors	%f6,%f9,%f6
	addcc	%i0,-1,%i0

	fors	%f16,%f19,%f16
	bg,pt	%icc,.loop0

! delay slot
	fors	%f26,%f29,%f26

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case1: reached from .loop2 when only hx2 < 0x3fc90000.
! x0 and x1 are evaluated via the lookup tables, x2 by the polynomial
! x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))) with z = x*x.
! The first store finishes flushing the previous y2 result.  The next
! argument is preloaded before looping back to .loop0.
.case1:
	st	%f27,[%o5+4]
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fand	%f8,%f44,%f2

	sub	%l0,%o7,%l0
	sub	%l1,%o7,%l1
	fand	%f18,%f44,%f12
	fmuld	%f20,%f20,%f22

	fsubd	%f0,%f2,%f0
	srl	%l0,10,%l0
	mov	%o0,%o3

	fsubd	%f10,%f12,%f10
	srl	%l1,10,%l1
	mov	%o1,%o4

	fmuld	%f22,%f52,%f24
	mov	%o2,%o5

	fmuld	%f0,%f0,%f2
	andn	%l0,0x1f,%l0

	fmuld	%f10,%f10,%f12
	andn	%l1,0x1f,%l1

	faddd	%f24,%f50,%f24

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f36

	fmuld	%f22,%f24,%f24

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4
	ldd	[%g1+%l0],%f34

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14
	ldd	[%g1+%l1],%f38

	faddd	%f24,%f48,%f24

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f22,%f24,%f24

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14

	faddd	%f24,%f46,%f24

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f2

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f12

	fmuld	%f4,%f32,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f14,%f36,%f14
	lda	[%i1]%asi,%f0

	fmuld	%f6,%f34,%f6
	lda	[%i1+4]%asi,%f1

	fmuld	%f16,%f38,%f16
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f22,%f24,%f24

	faddd	%f6,%f4,%f6

	faddd	%f16,%f14,%f16

	fmuld	%f20,%f24,%f24

	faddd	%f6,%f2,%f6

	faddd	%f16,%f12,%f16

! y2 = x2 + x2*z*poly(z)
	faddd	%f20,%f24,%f26

	faddd	%f6,%f32,%f6

	faddd	%f16,%f36,%f16
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

! reattach the saved sign bits
	fors	%f26,%f29,%f26
	addcc	%i0,-1,%i0

	fors	%f6,%f9,%f6
	bg,pt	%icc,.loop0

! delay slot
	fors	%f16,%f19,%f16

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case2: reached from .loop2 when hx1 < 0x3fc90000 and hx0 is not.
! If hx2 is also below the threshold control passes to .case3.
! Otherwise x0 and x2 are evaluated via the lookup tables and x1 by the
! polynomial x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))) with z = x*x.
! The leading stores finish flushing the previous y2 result; the second
! one sits in the delay slot and executes on both paths.
.case2:
	st	%f26,[%o5]
	cmp	%l2,%l5
	fpadd32s %f20,%f31,%f28
	bl,pn	%icc,.case3

! delay slot
	st	%f27,[%o5+4]
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fand	%f8,%f44,%f2

	sub	%l0,%o7,%l0
	sub	%l2,%o7,%l2
	fand	%f28,%f44,%f22
	fmuld	%f10,%f10,%f12

	fsubd	%f0,%f2,%f0
	srl	%l0,10,%l0
	mov	%o0,%o3

	fsubd	%f20,%f22,%f20
	srl	%l2,10,%l2
	mov	%o2,%o5

	fmuld	%f12,%f52,%f14
	mov	%o1,%o4

	fmuld	%f0,%f0,%f2
	andn	%l0,0x1f,%l0

	fmuld	%f20,%f20,%f22
	andn	%l2,0x1f,%l2

	faddd	%f14,%f50,%f14

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f40

	fmuld	%f12,%f14,%f14

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4
	ldd	[%g1+%l0],%f34

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24
	ldd	[%g1+%l2],%f42

	faddd	%f14,%f48,%f14

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	fmuld	%f12,%f14,%f14

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24

	faddd	%f14,%f46,%f14

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f2

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f22

	fmuld	%f4,%f32,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f24,%f40,%f24
	lda	[%i1]%asi,%f0

	fmuld	%f6,%f34,%f6
	lda	[%i1+4]%asi,%f1

	fmuld	%f26,%f42,%f26
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f12,%f14,%f14

	faddd	%f6,%f4,%f6

	faddd	%f26,%f24,%f26

	fmuld	%f10,%f14,%f14

	faddd	%f6,%f2,%f6

	faddd	%f26,%f22,%f26

! y1 = x1 + x1*z*poly(z)
	faddd	%f10,%f14,%f16

	faddd	%f6,%f32,%f6

	faddd	%f26,%f40,%f26
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

! reattach the saved sign bits
	fors	%f16,%f19,%f16
	addcc	%i0,-1,%i0

	fors	%f6,%f9,%f6
	bg,pt	%icc,.loop0

! delay slot
	fors	%f26,%f29,%f26

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case3: reached from .case2 when hx1 and hx2 are both below
! 0x3fc90000 and hx0 is not.  x0 is evaluated via the lookup tables;
! x1 and x2 by the polynomial x + x*z*(p1 + z*(p2 + z*(p3 + z*p4)))
! with z = x*x.
.case3:
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fand	%f8,%f44,%f2
	fmuld	%f10,%f10,%f12

	sub	%l0,%o7,%l0
	fmuld	%f20,%f20,%f22

	fsubd	%f0,%f2,%f0
	srl	%l0,10,%l0
	mov	%o0,%o3

	fmuld	%f12,%f52,%f14
	mov	%o1,%o4

	fmuld	%f22,%f52,%f24
	mov	%o2,%o5

	fmuld	%f0,%f0,%f2
	andn	%l0,0x1f,%l0

	faddd	%f14,%f50,%f14

	faddd	%f24,%f50,%f24

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f12,%f14,%f14

	fmuld	%f22,%f24,%f24

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4
	ldd	[%g1+%l0],%f34

	faddd	%f14,%f48,%f14

	faddd	%f24,%f48,%f24

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f12,%f14,%f14

	fmuld	%f22,%f24,%f24

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4

	faddd	%f14,%f46,%f14

	faddd	%f24,%f46,%f24

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f2

	fmuld	%f4,%f32,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f12,%f14,%f14
	lda	[%i1]%asi,%f0

	fmuld	%f6,%f34,%f6
	lda	[%i1+4]%asi,%f1

	fmuld	%f22,%f24,%f24
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f10,%f14,%f14

	faddd	%f6,%f4,%f6

	fmuld	%f20,%f24,%f24

! y1 = x1 + x1*z*poly(z)
	faddd	%f10,%f14,%f16

	faddd	%f6,%f2,%f6

! y2 = x2 + x2*z*poly(z)
	faddd	%f20,%f24,%f26

	fors	%f16,%f19,%f16
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

	faddd	%f6,%f32,%f6
	addcc	%i0,-1,%i0

	fors	%f26,%f29,%f26
	bg,pt	%icc,.loop0

! delay slot
	fors	%f6,%f9,%f6

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case4: reached from .loop2 when hx0 < 0x3fc90000.
! If hx1 is also below the threshold control passes to .case6; if only
! hx2 is, to .case5.  Otherwise x0 is evaluated by the polynomial
! x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))) with z = x*x, and x1 and x2
! via the lookup tables.
! The leading stores finish flushing the previous y1 and y2 results;
! those in delay slots execute whether or not the branch is taken.
.case4:
	st	%f17,[%o4+4]
	cmp	%l1,%l5
	fpadd32s %f10,%f31,%f18
	bl,pn	%icc,.case6

! delay slot
	st	%f26,[%o5]
	cmp	%l2,%l5
	fpadd32s %f20,%f31,%f28
	bl,pn	%icc,.case5

! delay slot
	st	%f27,[%o5+4]
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fand	%f18,%f44,%f12

	sub	%l1,%o7,%l1
	sub	%l2,%o7,%l2
	fand	%f28,%f44,%f22
	fmuld	%f0,%f0,%f2

	fsubd	%f10,%f12,%f10
	srl	%l1,10,%l1
	mov	%o1,%o4

	fsubd	%f20,%f22,%f20
	srl	%l2,10,%l2
	mov	%o2,%o5

! keep |x0| in %f6: %f0 is overwritten by the preload further down
	fmovd	%f0,%f6
	fmuld	%f2,%f52,%f4
	mov	%o0,%o3

	fmuld	%f10,%f10,%f12
	andn	%l1,0x1f,%l1

	fmuld	%f20,%f20,%f22
	andn	%l2,0x1f,%l2

	faddd	%f4,%f50,%f4

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f36

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f40

	fmuld	%f2,%f4,%f4

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14
	ldd	[%g1+%l1],%f38

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24
	ldd	[%g1+%l2],%f42

	faddd	%f4,%f48,%f4

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	fmuld	%f2,%f4,%f4

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24

	faddd	%f4,%f46,%f4

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f12

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f22

	fmuld	%f14,%f36,%f14
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f24,%f40,%f24
	lda	[%i1]%asi,%f0

	fmuld	%f16,%f38,%f16
	lda	[%i1+4]%asi,%f1

	fmuld	%f26,%f42,%f26
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f2,%f4,%f4

	faddd	%f16,%f14,%f16

	faddd	%f26,%f24,%f26

	fmuld	%f6,%f4,%f4

	faddd	%f16,%f12,%f16

	faddd	%f26,%f22,%f26

! y0 = x0 + x0*z*poly(z)
	faddd	%f6,%f4,%f6

	faddd	%f16,%f36,%f16

	faddd	%f26,%f40,%f26
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

! reattach the saved sign bits
	fors	%f6,%f9,%f6
	addcc	%i0,-1,%i0

	fors	%f16,%f19,%f16
	bg,pt	%icc,.loop0

! delay slot
	fors	%f26,%f29,%f26

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case5: reached from .case4 when hx0 and hx2 are below 0x3fc90000
! and hx1 is not.  x1 is evaluated via the lookup tables; x0 and x2 by
! the polynomial x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))) with z = x*x.
.case5:
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fand	%f18,%f44,%f12
	fmuld	%f0,%f0,%f2

	sub	%l1,%o7,%l1
	fmuld	%f20,%f20,%f22

	fsubd	%f10,%f12,%f10
	srl	%l1,10,%l1
	mov	%o1,%o4

! keep |x0| in %f6: %f0 is overwritten by the preload further down
	fmovd	%f0,%f6
	fmuld	%f2,%f52,%f4
	mov	%o0,%o3

	fmuld	%f22,%f52,%f24
	mov	%o2,%o5

	fmuld	%f10,%f10,%f12
	andn	%l1,0x1f,%l1

	faddd	%f4,%f50,%f4

	faddd	%f24,%f50,%f24

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f36

	fmuld	%f2,%f4,%f4

	fmuld	%f22,%f24,%f24

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14
	ldd	[%g1+%l1],%f38

	faddd	%f4,%f48,%f4

	faddd	%f24,%f48,%f24

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f2,%f4,%f4

	fmuld	%f22,%f24,%f24

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14

	faddd	%f4,%f46,%f4

	faddd	%f24,%f46,%f24

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f12

	fmuld	%f14,%f36,%f14
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f2,%f4,%f4
	lda	[%i1]%asi,%f0

	fmuld	%f16,%f38,%f16
	lda	[%i1+4]%asi,%f1

	fmuld	%f22,%f24,%f24
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f6,%f4,%f4

	faddd	%f16,%f14,%f16

	fmuld	%f20,%f24,%f24

! y0 = x0 + x0*z*poly(z)
	faddd	%f6,%f4,%f6

	faddd	%f16,%f12,%f16

! y2 = x2 + x2*z*poly(z)
	faddd	%f20,%f24,%f26

	fors	%f6,%f9,%f6
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

	faddd	%f16,%f36,%f16
	addcc	%i0,-1,%i0

	fors	%f26,%f29,%f26
	bg,pt	%icc,.loop0

! delay slot
	fors	%f16,%f19,%f16

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case6: reached from .case4 when hx0 and hx1 are below 0x3fc90000.
! If hx2 is below the threshold too, control passes to .case7.
! Otherwise x2 is evaluated via the lookup tables and x0 and x1 by the
! polynomial x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))) with z = x*x.
! The first store finishes flushing the previous y2 result.
.case6:
	st	%f27,[%o5+4]
	cmp	%l2,%l5
	fpadd32s %f20,%f31,%f28
	bl,pn	%icc,.case7

! delay slot
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fand	%f28,%f44,%f22
	fmuld	%f0,%f0,%f2

	sub	%l2,%o7,%l2
	fmuld	%f10,%f10,%f12

	fsubd	%f20,%f22,%f20
	srl	%l2,10,%l2
	mov	%o2,%o5

! keep |x0| in %f6: %f0 is overwritten by the preload further down
	fmovd	%f0,%f6
	fmuld	%f2,%f52,%f4
	mov	%o0,%o3

	fmuld	%f12,%f52,%f14
	mov	%o1,%o4

	fmuld	%f20,%f20,%f22
	andn	%l2,0x1f,%l2

	faddd	%f4,%f50,%f4

	faddd	%f14,%f50,%f14

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f40

	fmuld	%f2,%f4,%f4

	fmuld	%f12,%f14,%f14

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24
	ldd	[%g1+%l2],%f42

	faddd	%f4,%f48,%f4

	faddd	%f14,%f48,%f14

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	fmuld	%f2,%f4,%f4

	fmuld	%f12,%f14,%f14

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24

	faddd	%f4,%f46,%f4

	faddd	%f14,%f46,%f14

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f22

	fmuld	%f24,%f40,%f24
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f2,%f4,%f4
	lda	[%i1]%asi,%f0

	fmuld	%f26,%f42,%f26
	lda	[%i1+4]%asi,%f1

	fmuld	%f12,%f14,%f14
	add	%i1,%i2,%i1		! x += stridex

	fmuld	%f6,%f4,%f4

	faddd	%f26,%f24,%f26

	fmuld	%f10,%f14,%f14

! y0 = x0 + x0*z*poly(z)
	faddd	%f6,%f4,%f6

	faddd	%f26,%f22,%f26

! y1 = x1 + x1*z*poly(z)
	faddd	%f10,%f14,%f16

	fors	%f6,%f9,%f6
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

	faddd	%f26,%f40,%f26
	addcc	%i0,-1,%i0

	fors	%f16,%f19,%f16
	bg,pt	%icc,.loop0

! delay slot
	fors	%f26,%f29,%f26

	ba,pt	%icc,.endloop0
! delay slot
	nop

	.align	32
! .case7: reached from .case6 when hx0, hx1 and hx2 are all below
! 0x3fc90000.  No table lookup is needed; all three results are
! x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))) with z = x*x, evaluated in
! three interleaved streams (%f4, %f14, %f24).
.case7:
	fmuld	%f0,%f0,%f2
! keep |x0| in %f6: %f0 is overwritten by the preload just below
	fmovd	%f0,%f6
	mov	%o0,%o3

	fmuld	%f10,%f10,%f12
	mov	%o1,%o4

	fmuld	%f20,%f20,%f22
	mov	%o2,%o5

	fmuld	%f2,%f52,%f4
	lda	[%i1]%asi,%l0		! preload next argument

	fmuld	%f12,%f52,%f14
	lda	[%i1]%asi,%f0

	fmuld	%f22,%f52,%f24
	lda	[%i1+4]%asi,%f1

	faddd	%f4,%f50,%f4
	add	%i1,%i2,%i1		! x += stridex

	faddd	%f14,%f50,%f14

	faddd	%f24,%f50,%f24

	fmuld	%f2,%f4,%f4

	fmuld	%f12,%f14,%f14

	fmuld	%f22,%f24,%f24

	faddd	%f4,%f48,%f4

	faddd	%f14,%f48,%f14

	faddd	%f24,%f48,%f24

	fmuld	%f2,%f4,%f4

	fmuld	%f12,%f14,%f14

	fmuld	%f22,%f24,%f24

	faddd	%f4,%f46,%f4

	faddd	%f14,%f46,%f14

	faddd	%f24,%f46,%f24

	fmuld	%f2,%f4,%f4

	fmuld	%f12,%f14,%f14

	fmuld	%f22,%f24,%f24

	fmuld	%f6,%f4,%f4

	fmuld	%f10,%f14,%f14

	fmuld	%f20,%f24,%f24

! y = x + x*z*poly(z) for each of the three elements
	faddd	%f6,%f4,%f6

	faddd	%f10,%f14,%f16

	faddd	%f20,%f24,%f26
	andn	%l0,%i5,%l0		! hx &= ~0x80000000

! reattach the saved sign bits
	fors	%f6,%f9,%f6
	addcc	%i0,-1,%i0

	fors	%f16,%f19,%f16
	bg,pt	%icc,.loop0

! delay slot
	fors	%f26,%f29,%f26

	ba,pt	%icc,.endloop0
! delay slot
	nop


	.align	32
! .endloop2: the element count ran out with x1 (in %f10, high word in
! %l1) still unprocessed.  Compute sin(x1) on its own, store it through
! %o1 (py1), then fall into .endloop1 to do the same for x0.
! %f20 is used as the working result register.
.endloop2:
	cmp	%l1,%l5
	bl,pn	%icc,1f
! delay slot
	fabsd	%f10,%f10
! hx1 >= 0x3fc90000: table-driven evaluation
	sethi	%hi(0x3fc3c000),%o7
	fpadd32s %f10,%f31,%f18
	add	%l3,8,%g1
	fand	%f18,%f44,%f12
	sub	%l1,%o7,%l1
	fsubd	%f10,%f12,%f10
	srl	%l1,10,%l1
	fmuld	%f10,%f10,%f12
	andn	%l1,0x1f,%l1
	fmuld	%f12,%f58,%f20
	ldd	[%l3+%l1],%f36
	faddd	%f20,%f56,%f20
	fmuld	%f12,%f62,%f14
	ldd	[%g1+%l1],%f38
	fmuld	%f12,%f20,%f20
	faddd	%f14,%f60,%f14
	faddd	%f20,%f54,%f20
	fmuld	%f12,%f14,%f14
	fmuld	%f10,%f20,%f20
	ldd	[%l4+%l1],%f12
	fmuld	%f14,%f36,%f14
	fmuld	%f20,%f38,%f20
	faddd	%f20,%f14,%f20
	faddd	%f20,%f12,%f20
	ba,pt	%icc,2f
! delay slot
	faddd	%f20,%f36,%f20
1:
! hx1 < 0x3fc90000: x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))), z = x*x
	fmuld	%f10,%f10,%f12
	fmuld	%f12,%f52,%f14
	faddd	%f14,%f50,%f14
	fmuld	%f12,%f14,%f14
	faddd	%f14,%f48,%f14
	fmuld	%f12,%f14,%f14
	faddd	%f14,%f46,%f14
	fmuld	%f12,%f14,%f14
	fmuld	%f10,%f14,%f14
	faddd	%f10,%f14,%f20
2:
! reattach the sign of x1 and store the result
	fors	%f20,%f19,%f20
	st	%f20,[%o1]
	st	%f21,[%o1+4]

! .endloop1: the element count ran out with x0 (in %f0, high word in
! %l0) still unprocessed.  Compute sin(x0) on its own, store it through
! %o0 (py0), then fall into .endloop0.
! %f20 is used as the working result register.
.endloop1:
	cmp	%l0,%l5
	bl,pn	%icc,1f
! delay slot
	fabsd	%f0,%f0
! hx0 >= 0x3fc90000: table-driven evaluation
	sethi	%hi(0x3fc3c000),%o7
	fpadd32s %f0,%f31,%f8
	add	%l3,8,%g1
	fand	%f8,%f44,%f2
	sub	%l0,%o7,%l0
	fsubd	%f0,%f2,%f0
	srl	%l0,10,%l0
	fmuld	%f0,%f0,%f2
	andn	%l0,0x1f,%l0
	fmuld	%f2,%f58,%f20
	ldd	[%l3+%l0],%f32
	faddd	%f20,%f56,%f20
	fmuld	%f2,%f62,%f4
	ldd	[%g1+%l0],%f34
	fmuld	%f2,%f20,%f20
	faddd	%f4,%f60,%f4
	faddd	%f20,%f54,%f20
	fmuld	%f2,%f4,%f4
	fmuld	%f0,%f20,%f20
	ldd	[%l4+%l0],%f2
	fmuld	%f4,%f32,%f4
	fmuld	%f20,%f34,%f20
	faddd	%f20,%f4,%f20
	faddd	%f20,%f2,%f20
	ba,pt	%icc,2f
! delay slot
	faddd	%f20,%f32,%f20
1:
! hx0 < 0x3fc90000: x + x*z*(p1 + z*(p2 + z*(p3 + z*p4))), z = x*x
	fmuld	%f0,%f0,%f2
	fmuld	%f2,%f52,%f4
	faddd	%f4,%f50,%f4
	fmuld	%f2,%f4,%f4
	faddd	%f4,%f48,%f4
	fmuld	%f2,%f4,%f4
	faddd	%f4,%f46,%f4
	fmuld	%f2,%f4,%f4
	fmuld	%f0,%f4,%f4
	faddd	%f0,%f4,%f20
2:
! reattach the sign of x0 and store the result
	fors	%f20,%f9,%f20
	st	%f20,[%o0]
	st	%f21,[%o0+4]

! .endloop0: store the three results still held in %f6, %f16 and %f26
! through %o3/%o4/%o5 and return.  If no triple was ever completed
! those pointers still address the scratch stack slot set up at entry,
! so the stores are harmless.
.endloop0:
	st	%f6,[%o3]
	st	%f7,[%o3+4]
	st	%f16,[%o4]
	st	%f17,[%o4+4]
	st	%f26,[%o5]
	st	%f27,[%o5+4]

! return.  finished off with only primary range arguments.

	ret
	restore


	.align	32
! .range0: x0 is outside the primary range.
! If hx0 is above the limit in LIM_l6, continue in the medium-range code
! with LIM_l6 = 1 to record that .loop0 was active.  Otherwise x0 is
! tiny: store x0 itself as the result, then shift the already preloaded
! next argument (%l1, %f10) into the x0 slot and re-enter .loop0.
.range0:
	cmp	%l0,LIM_l6
	bg,a,pt	%icc,.MEDIUM		! branch if x is not tiny
! delay slot, annulled if branch not taken
	mov	0x1,LIM_l6		! set "processing loop0"
	st	%f0,[%o0]		! *y = *x with inexact if x nonzero
	st	%f1,[%o0+4]
! the converted value in %f2 is not consumed in this block
	fdtoi	%f0,%f2
	addcc	%i0,-1,%i0
	ble,pn	%icc,.endloop0
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
	andn	%l1,%i5,%l0		! hx &= ~0x80000000
	fmovd	%f10,%f0
	ba,pt	%icc,.loop0
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
! .range1: x1 is outside the primary range.
! If hx1 is above the limit in LIM_l6, continue in the medium-range code
! with LIM_l6 = 2 to record that .loop1 was active.  Otherwise x1 is
! tiny: store x1 itself as the result, then shift the already preloaded
! next argument (%l2, %f20) into the x1 slot and re-enter .loop1.
! If no elements remain, x0 is still pending and is finished in
! .endloop1.
.range1:
	cmp	%l1,LIM_l6
	bg,a,pt	%icc,.MEDIUM		! branch if x is not tiny
! delay slot, annulled if branch not taken
	mov	0x2,LIM_l6		! set "processing loop1"
	st	%f10,[%o1]		! *y = *x with inexact if x nonzero
	st	%f11,[%o1+4]
! the converted value in %f12 is not consumed in this block
	fdtoi	%f10,%f12
	addcc	%i0,-1,%i0
	ble,pn	%icc,.endloop1
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
	andn	%l2,%i5,%l1		! hx &= ~0x80000000
	fmovd	%f20,%f10
	ba,pt	%icc,.loop1
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
! .range2: x2 is outside the primary range.
! If hx2 is above the limit in LIM_l6, continue in the medium-range code
! with LIM_l6 = 3 to record that .loop2 was active.  Otherwise x2 is
! tiny: store x2 itself as the result.  Nothing has been preloaded past
! x2, so the next argument is fetched here with ordinary loads, after
! the count check, before re-entering .loop2.  If no elements remain,
! x1 and x0 are still pending and are finished in .endloop2.
.range2:
	cmp	%l2,LIM_l6
	bg,a,pt	%icc,.MEDIUM		! branch if x is not tiny
! delay slot, annulled if branch not taken
	mov	0x3,LIM_l6		! set "processing loop2"
	st	%f20,[%o2]		! *y = *x with inexact if x nonzero
	st	%f21,[%o2+4]
! the converted value in %f22 is not consumed in this block
	fdtoi	%f20,%f22
1:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.endloop2
! delay slot
	nop
	ld	[%i1],%l2
	ld	[%i1],%f20
	ld	[%i1+4],%f21
	andn	%l2,%i5,%l2		! hx &= ~0x80000000
	ba,pt	%icc,.loop2
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
! .MEDIUM: entered from .range0/.range1/.range2 when an argument is
! too large for the primary range.  LIM_l6 holds 1, 2 or 3 according to
! which of .loop0/.loop1/.loop2 was active.
!
! This section flushes the results still pending from the primary loop,
! reloads %l5/%l7 and %f40-%f52 for the medium-range algorithm, seeds
! the stack slots used by the mixed cases, and then enters .LOOP0,
! .LOOP1 or .LOOP2 after redoing the work that the skipped loop heads
! would have done for the arguments already in flight.
.MEDIUM:

! ========== medium range ==========

! register use

! i0  n
! i1  x
! i2  stridex
! i3  y
! i4  stridey
! i5  0x80000000

! l0  hx0
! l1  hx1
! l2  hx2
! l3  __vlibm_TBL_sincos_hi
! l4  __vlibm_TBL_sincos_lo
! l5  constants
! l6  in transition from pri-range and here, use for biguns
! l7  0x413921fb

! the following are 64-bit registers in both V8+ and V9

! g1  scratch
! g5

! o0  py0
! o1  py1
! o2  py2
! o3  n0
! o4  n1
! o5  n2
! o7  scratch

! f0  x0
! f2  n0,y0
! f4
! f6
! f8  scratch for table base
! f9  signbit0
! f10 x1
! f12 n1,y1
! f14
! f16
! f18 scratch for table base
! f19 signbit1
! f20 x2
! f22 n2,y2
! f24
! f26
! f28 scratch for table base
! f29 signbit2
! f30 0x80000000
! f31 0x4000
! f32
! f34
! f36
! f38
! f40 invpio2
! f42 round
! f44 0xffff800000000000
! f46 pio2_1
! f48 pio2_2
! f50 pio2_3
! f52 pio2_3t
! f54 one
! f56 pp1
! f58 pp2
! f60 qq1
! f62 qq2

	PIC_SET(g5,constants,l5)

	! %o3,%o4,%o5 need to be stored
	st	%f6,[%o3]
	sethi	%hi(0x413921fb),%l7
	st	%f7,[%o3+4]
	or	%l7,%lo(0x413921fb),%l7
	st	%f16,[%o4]
	st	%f17,[%o4+4]
	st	%f26,[%o5]
	st	%f27,[%o5+4]
	ldd	[%l5+invpio2],%f40
	ldd	[%l5+round],%f42
	ldd	[%l5+pio2_1],%f46
	ldd	[%l5+pio2_2],%f48
	ldd	[%l5+pio2_3],%f50
	ldd	[%l5+pio2_3t],%f52
! the 8 bytes after each x?_1 slot hold 1.0 and the 8 bytes after each
! y?_0 slot hold zero; the mixed cases pick slot or slot+8 by parity of n
	std	%f54,[%fp+x0_1+8]	! set up stack data
	std	%f54,[%fp+x1_1+8]
	std	%f54,[%fp+x2_1+8]
	stx	%g0,[%fp+y0_0+8]
	stx	%g0,[%fp+y1_0+8]
	stx	%g0,[%fp+y2_0+8]

!	branched here in the middle of the array.  Need to adjust
!	for the members of the triple that were selected in the primary
!	loop.

!	no adjustment since all three selected here
	subcc	LIM_l6,0x1,%g0		! continue in LOOP0?
	bz,a	%icc,.LOOP0
	mov	0x0,LIM_l6		! delay slot set biguns=0

!	adjust 1st member of the triple since 2nd and 3rd are done here
!	(x0 already had its sign stripped by the primary loop)
	subcc	LIM_l6,0x2,%g0		! continue in LOOP1?
	fors	%f0,%f9,%f0		! restore sign bit
	fmuld	%f0,%f40,%f2		! adj LOOP0
	bz,a	%icc,.LOOP1
	mov	0x0,LIM_l6		! delay slot set biguns=0

!	adjust 1st and 2nd members of the triple since 3rd is done here
	subcc	LIM_l6,0x3,%g0		! continue in LOOP2?
	!done fmuld	%f0,%f40,%f2		! adj LOOP0
	sub	%i3,%i4,%i3		! adjust to not double increment
	fors	%f10,%f19,%f10		! restore sign bit
	fmuld	%f10,%f40,%f12		! adj LOOP1
	faddd	%f2,%f42,%f2		! adj LOOP1
	bz,a	%icc,.LOOP2
	mov	0x0,LIM_l6		! delay slot set biguns=0

	.align 32
! Medium-range loop, unrolled three ways like the primary loop.
!
! .LOOP0/.LOOP1/.LOOP2 record the output pointer of x0/x1/x2, preload
! the next argument (in .LOOP0 and .LOOP1), and send any argument with
! hx > 0x413921fb to .BIG0/.BIG1/.BIG2.  Each x is multiplied by invpio2
! and the "round" constant is added so that the low word of the sum
! holds the integer n; that word is saved in n0/n1/n2 on the stack.
!
! After .LOOP2 the code subtracts n times pio2_1, pio2_2, pio2_3 and
! pio2_3t from x in turn, producing a reduced argument (commented x) and
! a low-order tail (commented y).  The parity of n selects, per element,
! a comparison threshold and a sign mask from the words at thresh; the
! comparison result in %l0/%l1/%l2 then selects either the table-driven
! fall-through path or .CASE1 - .CASE4.
!
! .FIXSIGN applies a further sign flip when bit 1 of n is set, stores
! the three results and preloads the next x0.
.LOOP0:
	lda	[%i1]%asi,%l1		! preload next argument
	mov	%i3,%o0			! py0 = y
	lda	[%i1]%asi,%f10
	cmp	%l0,%l7
	add	%i3,%i4,%i3		! y += stridey
	bg,pn	%icc,.BIG0		! if hx > 0x413921fb

! delay slot
	lda	[%i1+4]%asi,%f11
	addcc	%i0,-1,%i0
	add	%i1,%i2,%i1		! x += stridex
	ble,pn	%icc,.ENDLOOP1

! delay slot
	andn	%l1,%i5,%l1
	nop
! start the reduction of x0: %f2 = x0 * invpio2
	fmuld	%f0,%f40,%f2
	fabsd	%f54,%f54		! a nop for alignment only

.LOOP1:
	lda	[%i1]%asi,%l2		! preload next argument
	mov	%i3,%o1			! py1 = y

	lda	[%i1]%asi,%f20
	cmp	%l1,%l7
	add	%i3,%i4,%i3		! y += stridey
	bg,pn	%icc,.BIG1		! if hx > 0x413921fb

! delay slot
	lda	[%i1+4]%asi,%f21
	addcc	%i0,-1,%i0
	add	%i1,%i2,%i1		! x += stridex
	ble,pn	%icc,.ENDLOOP2

! delay slot
	andn	%l2,%i5,%l2
	nop
! %f12 = x1 * invpio2; add the round constant to the x0 product
	fmuld	%f10,%f40,%f12
	faddd	%f2,%f42,%f2

.LOOP2:
! the low word of (x0*invpio2 + round) is the integer n0
	st	%f3,[%fp+n0]
	mov	%i3,%o2			! py2 = y

	cmp	%l2,%l7
	add	%i3,%i4,%i3		! y += stridey
	fmuld	%f20,%f40,%f22
	bg,pn	%icc,.BIG2		! if hx > 0x413921fb

! delay slot
! %o7 addresses the second word of the thresh entries, %g1 the first
	add	%l5,thresh+4,%o7
	faddd	%f12,%f42,%f12
	st	%f13,[%fp+n1]

! -

	add	%l5,thresh,%g1
	faddd	%f22,%f42,%f22
	st	%f23,[%fp+n2]

	fsubd	%f2,%f42,%f2		! n

	fsubd	%f12,%f42,%f12		! n

	fsubd	%f22,%f42,%f22		! n

! first reduction step: x - n*pio2_1, then subtract n*pio2_2
	fmuld	%f2,%f46,%f4

	fmuld	%f12,%f46,%f14

	fmuld	%f22,%f46,%f24

	fsubd	%f0,%f4,%f4
	fmuld	%f2,%f48,%f6

	fsubd	%f10,%f14,%f14
	fmuld	%f12,%f48,%f16

	fsubd	%f20,%f24,%f24
	fmuld	%f22,%f48,%f26

	fsubd	%f4,%f6,%f0
	ld	[%fp+n0],%o3

	fsubd	%f14,%f16,%f10
	ld	[%fp+n1],%o4

	fsubd	%f24,%f26,%f20
	ld	[%fp+n2],%o5

! %f32/%f34/%f36 track the rounding error of each subtraction so it can
! be folded into the next correction term; meanwhile %o3/%o4/%o5 become
! (n & 1) * 8
	fsubd	%f4,%f0,%f32
	and	%o3,1,%o3

	fsubd	%f14,%f10,%f34
	and	%o4,1,%o4

	fsubd	%f24,%f20,%f36
	and	%o5,1,%o5

	fsubd	%f32,%f6,%f32
	fmuld	%f2,%f50,%f8
	sll	%o3,3,%o3

	fsubd	%f34,%f16,%f34
	fmuld	%f12,%f50,%f18
	sll	%o4,3,%o4

	fsubd	%f36,%f26,%f36
	fmuld	%f22,%f50,%f28
	sll	%o5,3,%o5

! load the comparison threshold selected by the parity of n:
! 0x3fc90000 for even n, 0x3fc40000 for odd n
	fsubd	%f8,%f32,%f8
	ld	[%g1+%o3],%f6

	fsubd	%f18,%f34,%f18
	ld	[%g1+%o4],%f16

	fsubd	%f28,%f36,%f28
	ld	[%g1+%o5],%f26

	fsubd	%f0,%f8,%f4

	fsubd	%f10,%f18,%f14

	fsubd	%f20,%f28,%f24

	fsubd	%f0,%f4,%f32

	fsubd	%f10,%f14,%f34

	fsubd	%f20,%f24,%f36

	fsubd	%f32,%f8,%f32
	fmuld	%f2,%f52,%f2

	fsubd	%f34,%f18,%f34
	fmuld	%f12,%f52,%f12

	fsubd	%f36,%f28,%f36
	fmuld	%f22,%f52,%f22

! load the sign mask selected by the parity of n:
! 0x80000000 for even n, 0 for odd n
	fsubd	%f2,%f32,%f2
	ld	[%o7+%o3],%f8

	fsubd	%f12,%f34,%f12
	ld	[%o7+%o4],%f18

	fsubd	%f22,%f36,%f22
	ld	[%o7+%o5],%f28

	fsubd	%f4,%f2,%f0		! x

	fsubd	%f14,%f12,%f10		! x

	fsubd	%f24,%f22,%f20		! x

	fsubd	%f4,%f0,%f4

	fsubd	%f14,%f10,%f14

	fsubd	%f24,%f20,%f24

	fands	%f0,%f30,%f9		! save signbit

	fands	%f10,%f30,%f19		! save signbit

	fands	%f20,%f30,%f29		! save signbit

! store |x| so that its high word can be fetched into an integer
! register and so that the mixed cases can reload it from the stack
	fabsd	%f0,%f0
	std	%f0,[%fp+x0_1]

	fabsd	%f10,%f10
	std	%f10,[%fp+x1_1]

	fabsd	%f20,%f20
	std	%f20,[%fp+x2_1]

	fsubd	%f4,%f2,%f2		! y

	fsubd	%f14,%f12,%f12		! y

	fsubd	%f24,%f22,%f22		! y

! compare threshold with |x|; the bit tested below (value 2) is expected
! to be the result for the upper 32-bit words (see the VIS reference)
	fcmpgt32 %f6,%f0,%l0

	fcmpgt32 %f16,%f10,%l1

	fcmpgt32 %f26,%f20,%l2

! -- 16 byte aligned
! give the tail y the sign that was stripped from x
	fxors	%f2,%f9,%f2

	fxors	%f12,%f19,%f12

	fxors	%f22,%f29,%f22

	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
	andcc	%l0,2,%g0
	bne,pn	%icc,.CASE4

! delay slot
	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
	andcc	%l1,2,%g0
	bne,pn	%icc,.CASE2

! delay slot
	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
	andcc	%l2,2,%g0
	bne,pn	%icc,.CASE1

! delay slot
! All three arguments are at or above their thresholds: table-driven
! path.  The table offset is built as in the primary range, then
! %o3/%o4/%o5 (0 or 8 by parity of n) is added to it.
	fpadd32s %f0,%f31,%f8
	sethi	%hi(0x3fc3c000),%o7
	ld	[%fp+x0_1],%l0

	fpadd32s %f10,%f31,%f18
	add	%l3,8,%g1
	ld	[%fp+x1_1],%l1

	fpadd32s %f20,%f31,%f28
	ld	[%fp+x2_1],%l2

	fand	%f8,%f44,%f4
	sub	%l0,%o7,%l0

	fand	%f18,%f44,%f14
	sub	%l1,%o7,%l1

	fand	%f28,%f44,%f24
	sub	%l2,%o7,%l2

	fsubd	%f0,%f4,%f0
	srl	%l0,10,%l0

	fsubd	%f10,%f14,%f10
	srl	%l1,10,%l1

	fsubd	%f20,%f24,%f20
	srl	%l2,10,%l2

! add the tail y to the difference from the breakpoint
	faddd	%f0,%f2,%f0
	andn	%l0,0x1f,%l0

	faddd	%f10,%f12,%f10
	andn	%l1,0x1f,%l1

	faddd	%f20,%f22,%f20
	andn	%l2,0x1f,%l2

	fmuld	%f0,%f0,%f2
	add	%l0,%o3,%l0

	fmuld	%f10,%f10,%f12
	add	%l1,%o4,%l1

	fmuld	%f20,%f20,%f22
	add	%l2,%o5,%l2

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f34

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f36

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24

	fmuld	%f0,%f6,%f6
	ldd	[%g1+%l0],%f2

	fmuld	%f10,%f16,%f16
	ldd	[%g1+%l1],%f12

	fmuld	%f20,%f26,%f26
	ldd	[%g1+%l2],%f22

	fmuld	%f4,%f32,%f4
	ldd	[%l4+%l0],%f0

	fmuld	%f14,%f34,%f14
	ldd	[%l4+%l1],%f10

	fmuld	%f24,%f36,%f24
	ldd	[%l4+%l2],%f20

	fmuld	%f6,%f2,%f6

	fmuld	%f16,%f12,%f16

	fmuld	%f26,%f22,%f26

	faddd	%f6,%f4,%f6

	faddd	%f16,%f14,%f16

	faddd	%f26,%f24,%f26

	faddd	%f6,%f0,%f6

	faddd	%f16,%f10,%f16

	faddd	%f26,%f20,%f26

	faddd	%f6,%f32,%f6

	faddd	%f16,%f34,%f16

	faddd	%f26,%f36,%f26

.FIXSIGN:
! %g1 + 0 addresses a zero word and %g1 + 8 the word 0x80000000, so the
! word loaded at offset (n & 2) << 2 flips the sign when bit 1 of n is set
	ld	[%fp+n0],%o3
	add	%l5,thresh-4,%g1

	ld	[%fp+n1],%o4

	ld	[%fp+n2],%o5
	and	%o3,2,%o3

	sll	%o3,2,%o3
	and	%o4,2,%o4
	lda	[%i1]%asi,%l0		! preload next argument

	sll	%o4,2,%o4
	and	%o5,2,%o5
	ld	[%g1+%o3],%f8

	sll	%o5,2,%o5
	ld	[%g1+%o4],%f18

	ld	[%g1+%o5],%f28
	fxors	%f9,%f8,%f9

	lda	[%i1]%asi,%f0
	fxors	%f29,%f28,%f29

	lda	[%i1+4]%asi,%f1
	fxors	%f19,%f18,%f19

	fors	%f6,%f9,%f6		! tack on sign
	add	%i1,%i2,%i1		! x += stridex
	st	%f6,[%o0]

	fors	%f26,%f29,%f26		! tack on sign
	st	%f7,[%o0+4]

	fors	%f16,%f19,%f16		! tack on sign
	st	%f26,[%o2]

	st	%f27,[%o2+4]
	addcc	%i0,-1,%i0

	st	%f16,[%o1]
	andn	%l0,%i5,%l0		! hx &= ~0x80000000
	bg,pt	%icc,.LOOP0

! delay slot
	st	%f17,[%o1+4]

	ba,pt	%icc,.ENDLOOP0
! delay slot
	nop

	.align	32
! .CASE1: reached from the medium-range loop when only the third
! reduced argument is below its threshold.  x0 and x1 take the
! table-driven path; x2 is evaluated by a polynomial only.
!
! %o5 is 0 or 8 according to the parity of n2.  Added to the constants
! base it selects the p4/p3/p2/p1 or the q4/q3/q2/q1 coefficients;
! added to %fp it selects either the saved |x2| and its tail y2, or the
! 1.0 and zero stored after those slots.  With z = x2*x2 the result is
! b + (b*z*(c1 + z*(c2 + z*(c3 + z*c4))) + t), where b is x2 or 1.0 and
! t is y2 or zero.
.CASE1:
	fpadd32s %f10,%f31,%f18
	sethi	%hi(0x3fc3c000),%o7
	ld	[%fp+x0_1],%l0

	fand	%f8,%f44,%f4
	add	%l3,8,%g1
	ld	[%fp+x1_1],%l1

	fand	%f18,%f44,%f14
	sub	%l0,%o7,%l0

	fsubd	%f0,%f4,%f0
	srl	%l0,10,%l0
	sub	%l1,%o7,%l1

	fsubd	%f10,%f14,%f10
	srl	%l1,10,%l1

! z = x2*x2; first coefficient (p4 or q4)
	fmuld	%f20,%f20,%f20
	ldd	[%l5+%o5],%f36
	add	%l5,%o5,%l2

	faddd	%f0,%f2,%f0
	andn	%l0,0x1f,%l0

	faddd	%f10,%f12,%f10
	andn	%l1,0x1f,%l1

	fmuld	%f20,%f36,%f24
	ldd	[%l2+0x10],%f26
	add	%fp,%o5,%o5

	fmuld	%f0,%f0,%f2
	add	%l0,%o3,%l0

	fmuld	%f10,%f10,%f12
	add	%l1,%o4,%l1

	faddd	%f24,%f26,%f24
	ldd	[%l2+0x20],%f36

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f34

	fmuld	%f20,%f24,%f24
	ldd	[%l2+0x30],%f26

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14

! %f36 = |x2| or 1.0, selected by %o5
	faddd	%f24,%f36,%f24
	ldd	[%o5+x2_1],%f36

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f20,%f24,%f24

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4
	ldd	[%g1+%l0],%f2

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14
	ldd	[%g1+%l1],%f12

	faddd	%f24,%f26,%f24

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f0

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f10

! save the tail y2 so it can be reloaded (or replaced by zero) via %o5
	fmuld	%f4,%f32,%f4
	std	%f22,[%fp+y2_0]

	fmuld	%f14,%f34,%f14

	fmuld	%f6,%f2,%f6

	fmuld	%f16,%f12,%f16

	fmuld	%f20,%f24,%f24

	faddd	%f6,%f4,%f6

	faddd	%f16,%f14,%f16

	fmuld	%f36,%f24,%f24
	ldd	[%o5+y2_0],%f22

	faddd	%f6,%f0,%f6

	faddd	%f16,%f10,%f16

	faddd	%f24,%f22,%f24

	faddd	%f6,%f32,%f6

	faddd	%f16,%f34,%f16
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f36,%f24,%f26

	.align	32
! .CASE2: reached from the medium-range loop when the second reduced
! argument is below its threshold and the first is not.  If the third
! is below its threshold as well, control passes to .CASE3.  Otherwise
! x0 and x2 take the table-driven path and x1 is evaluated by a
! polynomial only.
!
! %o4 is 0 or 8 according to the parity of n1.  Added to the constants
! base it selects the p4/p3/p2/p1 or the q4/q3/q2/q1 coefficients;
! added to %fp it selects either the saved |x1| and its tail y1, or the
! 1.0 and zero stored after those slots.  With z = x1*x1 the result is
! b + (b*z*(c1 + z*(c2 + z*(c3 + z*c4))) + t), where b is x1 or 1.0 and
! t is y1 or zero.
.CASE2:
	fpadd32s %f0,%f31,%f8
	ld	[%fp+x0_1],%l0
	andcc	%l2,2,%g0
	bne,pn	%icc,.CASE3

! delay slot
	sethi	%hi(0x3fc3c000),%o7
	fpadd32s %f20,%f31,%f28
	ld	[%fp+x2_1],%l2

	fand	%f8,%f44,%f4
	sub	%l0,%o7,%l0
	add	%l3,8,%g1

	fand	%f28,%f44,%f24
	sub	%l2,%o7,%l2

	fsubd	%f0,%f4,%f0
	srl	%l0,10,%l0

	fsubd	%f20,%f24,%f20
	srl	%l2,10,%l2

! z = x1*x1; first coefficient (p4 or q4)
	fmuld	%f10,%f10,%f10
	ldd	[%l5+%o4],%f34
	add	%l5,%o4,%l1

	faddd	%f0,%f2,%f0
	andn	%l0,0x1f,%l0

	faddd	%f20,%f22,%f20
	andn	%l2,0x1f,%l2

	fmuld	%f10,%f34,%f14
	ldd	[%l1+0x10],%f16
	add	%fp,%o4,%o4

	fmuld	%f0,%f0,%f2
	add	%l0,%o3,%l0

	fmuld	%f20,%f20,%f22
	add	%l2,%o5,%l2

	faddd	%f14,%f16,%f14
	ldd	[%l1+0x20],%f34

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f36

	fmuld	%f10,%f14,%f14
	ldd	[%l1+0x30],%f16

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24

! %f34 = |x1| or 1.0, selected by %o4
	faddd	%f14,%f34,%f14
	ldd	[%o4+x1_1],%f34

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	fmuld	%f10,%f14,%f14

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4
	ldd	[%g1+%l0],%f2

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24
	ldd	[%g1+%l2],%f22

	faddd	%f14,%f16,%f14

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f0

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f20

! save the tail y1 so it can be reloaded (or replaced by zero) via %o4
	fmuld	%f4,%f32,%f4
	std	%f12,[%fp+y1_0]

	fmuld	%f24,%f36,%f24

	fmuld	%f6,%f2,%f6

	fmuld	%f26,%f22,%f26

	fmuld	%f10,%f14,%f14

	faddd	%f6,%f4,%f6

	faddd	%f26,%f24,%f26

	fmuld	%f34,%f14,%f14
	ldd	[%o4+y1_0],%f12

	faddd	%f6,%f0,%f6

	faddd	%f26,%f20,%f26

	faddd	%f14,%f12,%f14

	faddd	%f6,%f32,%f6

	faddd	%f26,%f36,%f26
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f34,%f14,%f16

	.align	32
!
! .CASE3 -- element 0 uses the table-driven form; elements 1 and 2 use
! the direct polynomial.  Entered only from the bne in .CASE2, so on
! entry %f8 = fpadd32s(%f0,%f31), %l0 = high word of x0 (loaded from
! [%fp+x0_1]) and %o7 = %hi(0x3fc3c000) have already been set up there.
!
! Table-driven form (element 0):
!	a  = %f8 pair ANDed with %f44,  d = (x - a) + %f2,  z = d * d
!	i  = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o3
!	T0 = [%l3+i]   T1 = [%l3+8+i]   T2 = [%l4+i]
!	result = ((d*(%f54 + z*(%f56 + z*%f58)))*T1
!			+ (z*(%f60 + z*%f62))*T0 + T2) + T0
!
! Direct form (element k = 1, 2; offset register %o4 resp. %o5),
! with c = %l5+offset and z = x * x:
!	w = z * ([c+0x30] + z * ([c+0x20] + z * ([c+0x10] + z * [c])))
!	result = X + (X*w + Y), X = [%fp+offset+xk_1], Y = [%fp+offset+yk_0]
! NOTE(review): the offsets are presumably (n & 1) << 3, selecting the
! p* or q* coefficients -- confirm against the main loop.
!
! Results are left in %f6, %f16 and %f26 for .FIXSIGN.
!
.CASE3:
	fand	%f8,%f44,%f4
	add	%l3,8,%g1		! base for the T1 table load
	sub	%l0,%o7,%l0

	fmuld	%f10,%f10,%f10		! z1 = x1 * x1
	ldd	[%l5+%o4],%f34
	add	%l5,%o4,%l1		! coefficient base for element 1

	fsubd	%f0,%f4,%f0
	srl	%l0,10,%l0

	fmuld	%f20,%f20,%f20		! z2 = x2 * x2
	ldd	[%l5+%o5],%f36
	add	%l5,%o5,%l2		! coefficient base for element 2

	fmuld	%f10,%f34,%f14
	ldd	[%l1+0x10],%f16
	add	%fp,%o4,%o4		! %o4 now addresses the X/Y slots of element 1

	faddd	%f0,%f2,%f0
	andn	%l0,0x1f,%l0

	fmuld	%f20,%f36,%f24
	ldd	[%l2+0x10],%f26
	add	%fp,%o5,%o5		! %o5 now addresses the X/Y slots of element 2

	faddd	%f14,%f16,%f14
	ldd	[%l1+0x20],%f34

	fmuld	%f0,%f0,%f2
	add	%l0,%o3,%l0

	faddd	%f24,%f26,%f24
	ldd	[%l2+0x20],%f36

	fmuld	%f10,%f14,%f14
	ldd	[%l1+0x30],%f16

	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32		! T0 for element 0

	fmuld	%f20,%f24,%f24
	ldd	[%l2+0x30],%f26

	faddd	%f14,%f34,%f14
	ldd	[%o4+x1_1],%f34		! X for element 1

	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4

	faddd	%f24,%f36,%f24
	ldd	[%o5+x2_1],%f36		! X for element 2

	fmuld	%f10,%f14,%f14
	std	%f12,[%fp+y1_0]		! spill y1 so it can be reloaded via %o4

	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4

	fmuld	%f20,%f24,%f24
	std	%f22,[%fp+y2_0]		! spill y2 so it can be reloaded via %o5

	faddd	%f14,%f16,%f14

	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4
	ldd	[%g1+%l0],%f2		! T1 for element 0

	faddd	%f24,%f26,%f24

	fmuld	%f10,%f14,%f14

	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f0		! T2 for element 0

	fmuld	%f4,%f32,%f4

	fmuld	%f20,%f24,%f24

	fmuld	%f6,%f2,%f6

	fmuld	%f34,%f14,%f14
	ldd	[%o4+y1_0],%f12		! Y for element 1

	fmuld	%f36,%f24,%f24
	ldd	[%o5+y2_0],%f22		! Y for element 2

	faddd	%f6,%f4,%f6

	faddd	%f14,%f12,%f14

	faddd	%f24,%f22,%f24

	faddd	%f6,%f0,%f6

	faddd	%f34,%f14,%f16		! element 1 result

	faddd	%f36,%f24,%f26		! element 2 result
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f6,%f32,%f6		! element 0 result

	.align	32
!
! .CASE4 -- element 0 uses the direct polynomial.  Bit 1 of %l1 and of
! %l2 then select the form for elements 1 and 2:
!	%l1 & 2 set		-> .CASE6 (condition codes for %l2 & 2 are
!				   set in the delay slot for use there)
!	%l2 & 2 set		-> .CASE5 (%f18, %l1, %o7 and %g1 are
!				   already set up when it is reached)
!	neither			-> fall through: elements 1 and 2 both use
!				   the table-driven form below
!
! Direct form (element 0), with c = %l5+%o3 and z = x * x:
!	w = z * ([c+0x30] + z * ([c+0x20] + z * ([c+0x10] + z * [c])))
!	result = X + (X*w + Y), X = [%fp+%o3+x0_1], Y = [%fp+%o3+y0_0]
!
! Table-driven form (element 1; element 2 uses %f20-%f28, %l2, %o5,
! %f36):
!	a  = (high word of x + %f31), pair ANDed with %f44
!	d  = (x - a) + %f12,  z = d * d
!	i  = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o4
!	T0 = [%l3+i]   T1 = [%l3+8+i]   T2 = [%l4+i]
!	result = ((d*(%f54 + z*(%f56 + z*%f58)))*T1
!			+ (z*(%f60 + z*%f62))*T0 + T2) + T0
!
! Results are left in %f6, %f16 and %f26 for .FIXSIGN.
!
.CASE4:
	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
	sethi	%hi(0x3fc3c000),%o7	! table index bias
	andcc	%l1,2,%g0		! element 1 direct?
	bne,pn	%icc,.CASE6

! delay slot
	andcc	%l2,2,%g0		! element 2 direct? (also read by .CASE6)
	fpadd32s %f10,%f31,%f18
	ld	[%fp+x1_1],%l1		! high word of x1 for the table index
	bne,pn	%icc,.CASE5

! delay slot
	add	%l3,8,%g1		! base for the T1 table loads
	ld	[%fp+x2_1],%l2		! high word of x2 for the table index
	fpadd32s %f20,%f31,%f28

	fand	%f18,%f44,%f14
	sub	%l1,%o7,%l1

	fand	%f28,%f44,%f24
	sub	%l2,%o7,%l2

	fsubd	%f10,%f14,%f10
	srl	%l1,10,%l1

	fsubd	%f20,%f24,%f20
	srl	%l2,10,%l2

	fmuld	%f0,%f0,%f0		! z0 = x0 * x0 (direct path)
	ldd	[%l5+%o3],%f32
	add	%l5,%o3,%l0		! coefficient base for element 0

	faddd	%f10,%f12,%f10
	andn	%l1,0x1f,%l1

	faddd	%f20,%f22,%f20
	andn	%l2,0x1f,%l2

	fmuld	%f0,%f32,%f4
	ldd	[%l0+0x10],%f6
	add	%fp,%o3,%o3		! %o3 now addresses the X/Y stack slots

	fmuld	%f10,%f10,%f12
	add	%l1,%o4,%l1

	fmuld	%f20,%f20,%f22
	add	%l2,%o5,%l2

	faddd	%f4,%f6,%f4
	ldd	[%l0+0x20],%f32

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f34		! T0 for element 1

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f36		! T0 for element 2

	fmuld	%f0,%f4,%f4
	ldd	[%l0+0x30],%f6

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24

	faddd	%f4,%f32,%f4
	ldd	[%o3+x0_1],%f32		! X for element 0

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	fmuld	%f0,%f4,%f4

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14
	ldd	[%g1+%l1],%f12		! T1 for element 1

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24
	ldd	[%g1+%l2],%f22		! T1 for element 2

	faddd	%f4,%f6,%f4

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f10		! T2 for element 1

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f20		! T2 for element 2

	fmuld	%f14,%f34,%f14
	std	%f2,[%fp+y0_0]		! spill y0 so it can be reloaded via %o3

	fmuld	%f24,%f36,%f24

	fmuld	%f0,%f4,%f4

	fmuld	%f16,%f12,%f16

	fmuld	%f26,%f22,%f26

	fmuld	%f32,%f4,%f4
	ldd	[%o3+y0_0],%f2		! Y for element 0

	faddd	%f16,%f14,%f16

	faddd	%f26,%f24,%f26

	faddd	%f4,%f2,%f4

	faddd	%f16,%f10,%f16

	faddd	%f26,%f20,%f26

	faddd	%f32,%f4,%f6		! element 0 result

	faddd	%f16,%f34,%f16		! element 1 result
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f26,%f36,%f26		! element 2 result

	.align	32
!
! .CASE5 -- elements 0 and 2 use the direct polynomial; element 1 uses
! the table-driven form.  Entered only from the second bne in .CASE4,
! which has already set %o7 = %hi(0x3fc3c000), %f18 =
! fpadd32s(%f10,%f31), %l1 = high word of x1 and (in its delay slot)
! %g1 = %l3+8.
!
! Direct form (element k = 0, 2; offset register %o3 resp. %o5),
! with c = %l5+offset and z = x * x:
!	w = z * ([c+0x30] + z * ([c+0x20] + z * ([c+0x10] + z * [c])))
!	result = X + (X*w + Y), X = [%fp+offset+xk_1], Y = [%fp+offset+yk_0]
!
! Table-driven form (element 1):
!	a  = %f18 pair ANDed with %f44,  d = (x - a) + %f12,  z = d * d
!	i  = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o4
!	T0 = [%l3+i]   T1 = [%g1+i]   T2 = [%l4+i]
!	result = ((d*(%f54 + z*(%f56 + z*%f58)))*T1
!			+ (z*(%f60 + z*%f62))*T0 + T2) + T0
!
! Results are left in %f6, %f16 and %f26 for .FIXSIGN.
!
.CASE5:
	fand	%f18,%f44,%f14
	sub	%l1,%o7,%l1

	fmuld	%f0,%f0,%f0		! z0 = x0 * x0
	ldd	[%l5+%o3],%f32
	add	%l5,%o3,%l0		! coefficient base for element 0

	fsubd	%f10,%f14,%f10
	srl	%l1,10,%l1

	fmuld	%f20,%f20,%f20		! z2 = x2 * x2
	ldd	[%l5+%o5],%f36
	add	%l5,%o5,%l2		! coefficient base for element 2

	fmuld	%f0,%f32,%f4
	ldd	[%l0+0x10],%f6
	add	%fp,%o3,%o3		! %o3 now addresses the X/Y slots of element 0

	faddd	%f10,%f12,%f10
	andn	%l1,0x1f,%l1

	fmuld	%f20,%f36,%f24
	ldd	[%l2+0x10],%f26
	add	%fp,%o5,%o5		! %o5 now addresses the X/Y slots of element 2

	faddd	%f4,%f6,%f4
	ldd	[%l0+0x20],%f32

	fmuld	%f10,%f10,%f12
	add	%l1,%o4,%l1

	faddd	%f24,%f26,%f24
	ldd	[%l2+0x20],%f36

	fmuld	%f0,%f4,%f4
	ldd	[%l0+0x30],%f6

	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f34		! T0 for element 1

	fmuld	%f20,%f24,%f24
	ldd	[%l2+0x30],%f26

	faddd	%f4,%f32,%f4
	ldd	[%o3+x0_1],%f32		! X for element 0

	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14

	faddd	%f24,%f36,%f24
	ldd	[%o5+x2_1],%f36		! X for element 2

	fmuld	%f0,%f4,%f4
	std	%f2,[%fp+y0_0]		! spill y0 so it can be reloaded via %o3

	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14

	fmuld	%f20,%f24,%f24
	std	%f22,[%fp+y2_0]		! spill y2 so it can be reloaded via %o5

	faddd	%f4,%f6,%f4

	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14
	ldd	[%g1+%l1],%f12		! T1 for element 1

	faddd	%f24,%f26,%f24

	fmuld	%f0,%f4,%f4

	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f10		! T2 for element 1

	fmuld	%f14,%f34,%f14

	fmuld	%f20,%f24,%f24

	fmuld	%f16,%f12,%f16

	fmuld	%f32,%f4,%f4
	ldd	[%o3+y0_0],%f2		! Y for element 0

	fmuld	%f36,%f24,%f24
	ldd	[%o5+y2_0],%f22		! Y for element 2

	faddd	%f16,%f14,%f16

	faddd	%f4,%f2,%f4

	faddd	%f24,%f22,%f24

	faddd	%f16,%f10,%f16

	faddd	%f32,%f4,%f6		! element 0 result

	faddd	%f36,%f24,%f26		! element 2 result
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f16,%f34,%f16		! element 1 result

	.align	32
!
! .CASE6 -- elements 0 and 1 use the direct polynomial; element 2 uses
! the table-driven form unless bit 1 of %l2 was set, in which case
! control moves to .CASE7.  Entered only from the first bne in .CASE4:
! the condition codes tested by the bne below were produced by the
! "andcc %l2,2,%g0" in that branch delay slot (the ld and add that
! precede the bne here do not modify them), and %o7 =
! %hi(0x3fc3c000) was set there as well.
!
! Direct form (element k = 0, 1; offset register %o3 resp. %o4),
! with c = %l5+offset and z = x * x:
!	w = z * ([c+0x30] + z * ([c+0x20] + z * ([c+0x10] + z * [c])))
!	result = X + (X*w + Y), X = [%fp+offset+xk_1], Y = [%fp+offset+yk_0]
!
! Table-driven form (element 2):
!	a  = (high word of x + %f31), pair ANDed with %f44
!	d  = (x - a) + %f22,  z = d * d
!	i  = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o5
!	T0 = [%l3+i]   T1 = [%g1+i]   T2 = [%l4+i]
!	result = ((d*(%f54 + z*(%f56 + z*%f58)))*T1
!			+ (z*(%f60 + z*%f62))*T0 + T2) + T0
!
! Results are left in %f6, %f16 and %f26 for .FIXSIGN.
!
.CASE6:
	ld	[%fp+x2_1],%l2		! high word of x2 for the table index
	add	%l3,8,%g1		! base for the T1 table load
	bne,pn	%icc,.CASE7
! delay slot
	fpadd32s %f20,%f31,%f28

	fand	%f28,%f44,%f24
	ldd	[%l5+%o3],%f32
	add	%l5,%o3,%l0		! coefficient base for element 0

	fmuld	%f0,%f0,%f0		! z0 = x0 * x0
	sub	%l2,%o7,%l2

	fsubd	%f20,%f24,%f20
	srl	%l2,10,%l2

	fmuld	%f10,%f10,%f10		! z1 = x1 * x1
	ldd	[%l5+%o4],%f34
	add	%l5,%o4,%l1		! coefficient base for element 1

	fmuld	%f0,%f32,%f4
	ldd	[%l0+0x10],%f6
	add	%fp,%o3,%o3		! %o3 now addresses the X/Y slots of element 0

	faddd	%f20,%f22,%f20
	andn	%l2,0x1f,%l2

	fmuld	%f10,%f34,%f14
	ldd	[%l1+0x10],%f16
	add	%fp,%o4,%o4		! %o4 now addresses the X/Y slots of element 1

	faddd	%f4,%f6,%f4
	ldd	[%l0+0x20],%f32

	fmuld	%f20,%f20,%f22
	add	%l2,%o5,%l2

	faddd	%f14,%f16,%f14
	ldd	[%l1+0x20],%f34

	fmuld	%f0,%f4,%f4
	ldd	[%l0+0x30],%f6

	fmuld	%f22,%f58,%f26
	ldd	[%l3+%l2],%f36		! T0 for element 2

	fmuld	%f10,%f14,%f14
	ldd	[%l1+0x30],%f16

	faddd	%f4,%f32,%f4
	ldd	[%o3+x0_1],%f32		! X for element 0

	faddd	%f26,%f56,%f26
	fmuld	%f22,%f62,%f24

	faddd	%f14,%f34,%f14
	ldd	[%o4+x1_1],%f34		! X for element 1

	fmuld	%f0,%f4,%f4
	std	%f2,[%fp+y0_0]		! spill y0 so it can be reloaded via %o3

	fmuld	%f22,%f26,%f26
	faddd	%f24,%f60,%f24

	fmuld	%f10,%f14,%f14
	std	%f12,[%fp+y1_0]		! spill y1 so it can be reloaded via %o4

	faddd	%f4,%f6,%f4

	faddd	%f26,%f54,%f26
	fmuld	%f22,%f24,%f24
	ldd	[%g1+%l2],%f22		! T1 for element 2

	faddd	%f14,%f16,%f14

	fmuld	%f0,%f4,%f4

	fmuld	%f20,%f26,%f26
	ldd	[%l4+%l2],%f20		! T2 for element 2

	fmuld	%f24,%f36,%f24

	fmuld	%f10,%f14,%f14

	fmuld	%f26,%f22,%f26

	fmuld	%f32,%f4,%f4
	ldd	[%o3+y0_0],%f2		! Y for element 0

	fmuld	%f34,%f14,%f14
	ldd	[%o4+y1_0],%f12		! Y for element 1

	faddd	%f26,%f24,%f26

	faddd	%f4,%f2,%f4

	faddd	%f14,%f12,%f14

	faddd	%f26,%f20,%f26

	faddd	%f32,%f4,%f6		! element 0 result

	faddd	%f34,%f14,%f16		! element 1 result
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f26,%f36,%f26		! element 2 result

	.align	32
!
! .CASE7 -- all three elements use the direct polynomial; no table
! lookups are performed.  Reached from the bne in .CASE6.
!
! For element k = 0, 1, 2 (offset register %o3, %o4, %o5), with
! c = %l5+offset and z = x * x:
!	w = z * ([c+0x30] + z * ([c+0x20] + z * ([c+0x10] + z * [c])))
!	X = [%fp+offset+xk_1]   Y = [%fp+offset+yk_0]
!	result = X + (X*w + Y)
! NOTE(review): the offsets are presumably (n & 1) << 3, so that c
! selects the p* or q* coefficient set defined at the top of the file;
! the contents of the slots 8 bytes above xk_1 and yk_0 are set
! outside this chunk -- confirm against the main loop.
!
! Results are left in %f6, %f16 and %f26 for .FIXSIGN.
!
.CASE7:
	fmuld	%f0,%f0,%f0		! z0 = x0 * x0
	ldd	[%l5+%o3],%f32
	add	%l5,%o3,%l0		! coefficient base for element 0

	fmuld	%f10,%f10,%f10		! z1 = x1 * x1
	ldd	[%l5+%o4],%f34
	add	%l5,%o4,%l1		! coefficient base for element 1

	fmuld	%f20,%f20,%f20		! z2 = x2 * x2
	ldd	[%l5+%o5],%f36
	add	%l5,%o5,%l2		! coefficient base for element 2

	fmuld	%f0,%f32,%f4
	ldd	[%l0+0x10],%f6
	add	%fp,%o3,%o3		! %o3..%o5 now address the X/Y stack slots

	fmuld	%f10,%f34,%f14
	ldd	[%l1+0x10],%f16
	add	%fp,%o4,%o4

	fmuld	%f20,%f36,%f24
	ldd	[%l2+0x10],%f26
	add	%fp,%o5,%o5

	faddd	%f4,%f6,%f4
	ldd	[%l0+0x20],%f32

	faddd	%f14,%f16,%f14
	ldd	[%l1+0x20],%f34

	faddd	%f24,%f26,%f24
	ldd	[%l2+0x20],%f36

	fmuld	%f0,%f4,%f4
	ldd	[%l0+0x30],%f6

	fmuld	%f10,%f14,%f14
	ldd	[%l1+0x30],%f16

	fmuld	%f20,%f24,%f24
	ldd	[%l2+0x30],%f26

	faddd	%f4,%f32,%f4
	ldd	[%o3+x0_1],%f32		! X for element 0

	faddd	%f14,%f34,%f14
	ldd	[%o4+x1_1],%f34		! X for element 1

	faddd	%f24,%f36,%f24
	ldd	[%o5+x2_1],%f36		! X for element 2

	fmuld	%f0,%f4,%f4
	std	%f2,[%fp+y0_0]		! spill y0, y1, y2 for reload via %o3..%o5

	fmuld	%f10,%f14,%f14
	std	%f12,[%fp+y1_0]

	fmuld	%f20,%f24,%f24
	std	%f22,[%fp+y2_0]

	faddd	%f4,%f6,%f4

	faddd	%f14,%f16,%f14

	faddd	%f24,%f26,%f24

	fmuld	%f0,%f4,%f4

	fmuld	%f10,%f14,%f14

	fmuld	%f20,%f24,%f24

	fmuld	%f32,%f4,%f4
	ldd	[%o3+y0_0],%f2		! Y for element 0

	fmuld	%f34,%f14,%f14
	ldd	[%o4+y1_0],%f12		! Y for element 1

	fmuld	%f36,%f24,%f24
	ldd	[%o5+y2_0],%f22		! Y for element 2

	faddd	%f4,%f2,%f4

	faddd	%f14,%f12,%f14

	faddd	%f24,%f22,%f24

	faddd	%f32,%f4,%f6		! element 0 result

	faddd	%f34,%f14,%f16		! element 1 result
	ba,pt	%icc,.FIXSIGN

! delay slot
	faddd	%f36,%f24,%f26		! element 2 result


	.align	32
!
! .ENDLOOP2 -- scalar clean-up for the element held in %f10 (element 1)
! when the input runs out before a third element is available (this is
! the target of the count-exhausted branches in .SKIP2 and .BIG2).
! The result is stored through %o1 and control then falls through to
! .ENDLOOP1, which finishes element 0.
!
! Steps:
!   1. n = x * %f40, rounded by adding and subtracting %f42; the low
!      word of the intermediate sum is saved at [%fp+n1] and later
!      read back as an integer.
!      NOTE(review): %f40 and %f42 presumably hold the 2/pi and
!      0x43380000,0 entries of the constants table -- confirm.
!   2. x is reduced in four stages using n * %f46, %f48, %f50, %f52,
!      recovering the rounding error of each subtraction; the result
!      is a head x in %f10 and a tail y in %f12.
!   3. The sign bit of x is saved in %f19, x is replaced by |x| and
!      the same sign bit is xor-ed into y.
!   4. The high word of |x| is compared with the word at
!      thresh + 8*(n & 1).  Bit 1 of the fcmpgt32 mask (the high-word
!      comparison) selects the direct polynomial at 1:, otherwise the
!      table-driven form is used.
!   5. At 2: the saved sign is adjusted and or-ed into the result:
!      it is and-ed with the word at thresh+4 + 8*(n & 1) and xor-ed
!      with the word at thresh-4 + 4*(n & 2).
!
.ENDLOOP2:
	fmuld	%f10,%f40,%f12
	add	%l5,thresh,%g1
	faddd	%f12,%f42,%f12
	st	%f13,[%fp+n1]		! low word of the sum: n as an integer
	fsubd	%f12,%f42,%f12		! n
	fmuld	%f12,%f46,%f14
	fsubd	%f10,%f14,%f14		! x - n*%f46
	fmuld	%f12,%f48,%f16
	fsubd	%f14,%f16,%f10
	ld	[%fp+n1],%o4
	fsubd	%f14,%f10,%f34
	and	%o4,1,%o4
	fsubd	%f34,%f16,%f34		! rounding error of the previous stage
	fmuld	%f12,%f50,%f18
	sll	%o4,3,%o4		! %o4 = (n & 1) * 8
	fsubd	%f18,%f34,%f18
	ld	[%g1+%o4],%f16		! threshold high word for this parity
	fsubd	%f10,%f18,%f14
	fsubd	%f10,%f14,%f34
	add	%l5,thresh+4,%o7
	fsubd	%f34,%f18,%f34
	fmuld	%f12,%f52,%f12
	fsubd	%f12,%f34,%f12
	ld	[%o7+%o4],%f18		! sign mask word for this parity
	fsubd	%f14,%f12,%f10		! x
	fsubd	%f14,%f10,%f14
	fands	%f10,%f30,%f19		! save signbit
	fabsd	%f10,%f10
	std	%f10,[%fp+x1_1]
	fsubd	%f14,%f12,%f12		! y
	fcmpgt32 %f16,%f10,%l1
	fxors	%f12,%f19,%f12
	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
	andcc	%l1,2,%g0		! threshold high word > high word of |x| ?
	bne,pn	%icc,1f
! delay slot
	nop
! table-driven form:
!	a = (hx + %f31) pair ANDed with %f44,  d = (x - a) + y,  z = d*d
!	i = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o4
!	result = ((d*(%f54 + z*(%f56 + z*%f58)))*[%l3+8+i]
!			+ (z*(%f60 + z*%f62))*[%l3+i] + [%l4+i]) + [%l3+i]
	fpadd32s %f10,%f31,%f18
	ld	[%fp+x1_1],%l1
	fand	%f18,%f44,%f14
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fsubd	%f10,%f14,%f10
	sub	%l1,%o7,%l1
	srl	%l1,10,%l1
	faddd	%f10,%f12,%f10
	andn	%l1,0x1f,%l1
	fmuld	%f10,%f10,%f12
	add	%l1,%o4,%l1
	fmuld	%f12,%f58,%f16
	ldd	[%l3+%l1],%f34
	faddd	%f16,%f56,%f16
	fmuld	%f12,%f62,%f14
	fmuld	%f12,%f16,%f16
	faddd	%f14,%f60,%f14
	faddd	%f16,%f54,%f16
	fmuld	%f12,%f14,%f14
	ldd	[%g1+%l1],%f12
	fmuld	%f10,%f16,%f16
	ldd	[%l4+%l1],%f10
	fmuld	%f14,%f34,%f14
	fmuld	%f16,%f12,%f16
	faddd	%f16,%f14,%f16
	faddd	%f16,%f10,%f16
	ba,pt	%icc,2f
	faddd	%f16,%f34,%f16		! delay slot: final add of the table path
1:
! direct form, with c = %l5+%o4 and z = x*x:
!	w = z*([c+0x30] + z*([c+0x20] + z*([c+0x10] + z*[c])))
!	result = X + (X*w + Y), X = [%fp+%o4+x1_1], Y = [%fp+%o4+y1_0]
	fmuld	%f10,%f10,%f10
	ldd	[%l5+%o4],%f34
	add	%l5,%o4,%l1
	fmuld	%f10,%f34,%f14
	ldd	[%l1+0x10],%f16
	add	%fp,%o4,%o4
	faddd	%f14,%f16,%f14
	ldd	[%l1+0x20],%f34
	fmuld	%f10,%f14,%f14
	ldd	[%l1+0x30],%f16
	faddd	%f14,%f34,%f14
	ldd	[%o4+x1_1],%f34
	fmuld	%f10,%f14,%f14
	std	%f12,[%fp+y1_0]
	faddd	%f14,%f16,%f14
	fmuld	%f10,%f14,%f14
	fmuld	%f34,%f14,%f14
	ldd	[%o4+y1_0],%f12
	faddd	%f14,%f12,%f14
	faddd	%f34,%f14,%f16
2:
	add	%l5,thresh-4,%g1
	ld	[%fp+n1],%o4
	and	%o4,2,%o4
	sll	%o4,2,%o4		! %o4 = (n & 2) * 4
	ld	[%g1+%o4],%f18		! word xor-ed into the saved sign
	fxors	%f19,%f18,%f19
	fors	%f16,%f19,%f16		! tack on sign
	st	%f16,[%o1]
	st	%f17,[%o1+4]

!
! .ENDLOOP1 -- scalar clean-up for the element held in %f0 (element 0).
! Reached by fall-through from .ENDLOOP2 and from the count-exhausted
! branches in .SKIP1 and .BIG1.  The result is stored through %o0 and
! control then falls through to .ENDLOOP0.
!
! Steps (same scheme as .ENDLOOP2, using %f0-%f9, %f32, n0 and %o3):
!   1. n = x * %f40, rounded by adding and subtracting %f42; the low
!      word of the intermediate sum is saved at [%fp+n0].
!   2. Four-stage reduction with n * %f46, %f48, %f50, %f52 giving a
!      head x in %f0 and a tail y in %f2.
!   3. Sign bit of x saved in %f9, x replaced by |x|, sign xor-ed
!      into y.
!   4. High word of |x| compared with the word at thresh + 8*(n & 1);
!      bit 1 of the fcmpgt32 mask selects the direct polynomial at 1:,
!      otherwise the table-driven form is used.
!   5. At 2: the saved sign, and-ed with the word at
!      thresh+4 + 8*(n & 1) and xor-ed with the word at
!      thresh-4 + 4*(n & 2), is or-ed into the result.
!
.ENDLOOP1:
	fmuld	%f0,%f40,%f2
	add	%l5,thresh,%g1
	faddd	%f2,%f42,%f2
	st	%f3,[%fp+n0]		! low word of the sum: n as an integer
	fsubd	%f2,%f42,%f2		! n
	fmuld	%f2,%f46,%f4
	fsubd	%f0,%f4,%f4		! x - n*%f46
	fmuld	%f2,%f48,%f6
	fsubd	%f4,%f6,%f0
	ld	[%fp+n0],%o3
	fsubd	%f4,%f0,%f32
	and	%o3,1,%o3
	fsubd	%f32,%f6,%f32		! rounding error of the previous stage
	fmuld	%f2,%f50,%f8
	sll	%o3,3,%o3		! %o3 = (n & 1) * 8
	fsubd	%f8,%f32,%f8
	ld	[%g1+%o3],%f6		! threshold high word for this parity
	fsubd	%f0,%f8,%f4
	fsubd	%f0,%f4,%f32
	add	%l5,thresh+4,%o7
	fsubd	%f32,%f8,%f32
	fmuld	%f2,%f52,%f2
	fsubd	%f2,%f32,%f2
	ld	[%o7+%o3],%f8		! sign mask word for this parity
	fsubd	%f4,%f2,%f0		! x
	fsubd	%f4,%f0,%f4
	fands	%f0,%f30,%f9		! save signbit
	fabsd	%f0,%f0
	std	%f0,[%fp+x0_1]
	fsubd	%f4,%f2,%f2		! y
	fcmpgt32 %f6,%f0,%l0
	fxors	%f2,%f9,%f2
	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
	andcc	%l0,2,%g0		! threshold high word > high word of |x| ?
	bne,pn	%icc,1f
! delay slot
	nop
! table-driven form:
!	a = (hx + %f31) pair ANDed with %f44,  d = (x - a) + y,  z = d*d
!	i = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o3
!	result = ((d*(%f54 + z*(%f56 + z*%f58)))*[%l3+8+i]
!			+ (z*(%f60 + z*%f62))*[%l3+i] + [%l4+i]) + [%l3+i]
	fpadd32s %f0,%f31,%f8
	ld	[%fp+x0_1],%l0
	fand	%f8,%f44,%f4
	sethi	%hi(0x3fc3c000),%o7
	add	%l3,8,%g1
	fsubd	%f0,%f4,%f0
	sub	%l0,%o7,%l0
	srl	%l0,10,%l0
	faddd	%f0,%f2,%f0
	andn	%l0,0x1f,%l0
	fmuld	%f0,%f0,%f2
	add	%l0,%o3,%l0
	fmuld	%f2,%f58,%f6
	ldd	[%l3+%l0],%f32
	faddd	%f6,%f56,%f6
	fmuld	%f2,%f62,%f4
	fmuld	%f2,%f6,%f6
	faddd	%f4,%f60,%f4
	faddd	%f6,%f54,%f6
	fmuld	%f2,%f4,%f4
	ldd	[%g1+%l0],%f2
	fmuld	%f0,%f6,%f6
	ldd	[%l4+%l0],%f0
	fmuld	%f4,%f32,%f4
	fmuld	%f6,%f2,%f6
	faddd	%f6,%f4,%f6
	faddd	%f6,%f0,%f6
	ba,pt	%icc,2f
	faddd	%f6,%f32,%f6		! delay slot: final add of the table path
1:
! direct form, with c = %l5+%o3 and z = x*x:
!	w = z*([c+0x30] + z*([c+0x20] + z*([c+0x10] + z*[c])))
!	result = X + (X*w + Y), X = [%fp+%o3+x0_1], Y = [%fp+%o3+y0_0]
	fmuld	%f0,%f0,%f0
	ldd	[%l5+%o3],%f32
	add	%l5,%o3,%l0
	fmuld	%f0,%f32,%f4
	ldd	[%l0+0x10],%f6
	add	%fp,%o3,%o3
	faddd	%f4,%f6,%f4
	ldd	[%l0+0x20],%f32
	fmuld	%f0,%f4,%f4
	ldd	[%l0+0x30],%f6
	faddd	%f4,%f32,%f4
	ldd	[%o3+x0_1],%f32
	fmuld	%f0,%f4,%f4
	std	%f2,[%fp+y0_0]
	faddd	%f4,%f6,%f4
	fmuld	%f0,%f4,%f4
	fmuld	%f32,%f4,%f4
	ldd	[%o3+y0_0],%f2
	faddd	%f4,%f2,%f4
	faddd	%f32,%f4,%f6
2:
	add	%l5,thresh-4,%g1
	ld	[%fp+n0],%o3
	and	%o3,2,%o3
	sll	%o3,2,%o3		! %o3 = (n & 2) * 4
	ld	[%g1+%o3],%f8		! word xor-ed into the saved sign
	fxors	%f9,%f8,%f9
	fors	%f6,%f9,%f6		! tack on sign
	st	%f6,[%o0]
	st	%f7,[%o0+4]

!
! .ENDLOOP0 -- common exit.  If LIM_l6 is zero the routine returns
! immediately.  Otherwise (the .BIG* blocks copy %l7 into LIM_l6 when
! they defer a finite argument) the values saved in the stack frame
! are reloaded and passed to __vlibm_vsin_big:
!	%o0 = [%fp+nsave]	%o1 = [%fp+xsave]	%o2 = [%fp+sxsave]
!	%o3 = [%fp+ysave]	%o4 = [%fp+sysave]	%o5 = %l7
! NOTE(review): the slot names suggest these are the original count,
! x pointer, x stride, y pointer and y stride -- the stores are
! outside this chunk, confirm there.
!
.ENDLOOP0:

! check for huge arguments remaining

	tst	LIM_l6
	be,pt	%icc,.exit
! delay slot
	nop

! ========== huge range (use C code) ==========

#ifdef __sparcv9
	ldx	[%fp+xsave],%o1		! pointers are 64 bits wide on V9
	ldx	[%fp+ysave],%o3
#else
	ld	[%fp+xsave],%o1
	ld	[%fp+ysave],%o3
#endif
	ld	[%fp+nsave],%o0
	ld	[%fp+sxsave],%o2
	ld	[%fp+sysave],%o4
	sra	%o2,0,%o2		! sign-extend for V9
	sra	%o4,0,%o4
	call	__vlibm_vsin_big
	mov	%l7,%o5			! delay slot

! the call returns here and falls into the common return sequence
.exit:
	ret
	restore


	.align	32
!
! .SKIP0 -- skip the current element in slot 0 and refill the slot.
! Decrements the remaining count in %i0; if it is now <= 0 control goes
! to .ENDLOOP0.  Otherwise slot 0 is refilled from values already held
! for slot 1 (%l1 with the bits in %i5 cleared becomes %l0, %f10 is
! copied to %f0), the low word is loaded from [%i1+4] into %f1, and
! the main loop is re-entered at .LOOP0 with x advanced by stridex.
! NOTE(review): why the element is skipped is decided by the caller of
! this label, which is outside this chunk.
!
.SKIP0:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.ENDLOOP0
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
	andn	%l1,%i5,%l0		! hx &= ~0x80000000
	fmovs	%f10,%f0		! high word of the next x
	ld	[%i1+4],%f1		! low word of the next x
	ba,pt	%icc,.LOOP0
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
!
! .SKIP1 -- skip the current element in slot 1 and refill the slot.
! Decrements the remaining count in %i0; if it is now <= 0 control goes
! to .ENDLOOP1.  Otherwise slot 1 is refilled from values already held
! for slot 2 (%l2 with the bits in %i5 cleared becomes %l1, %f20 is
! copied to %f10), the low word is loaded from [%i1+4] into %f11, and
! the main loop is re-entered at .LOOP1 with x advanced by stridex.
! NOTE(review): why the element is skipped is decided by the caller of
! this label, which is outside this chunk.
!
.SKIP1:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.ENDLOOP1
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
	andn	%l2,%i5,%l1		! hx &= ~0x80000000
	fmovs	%f20,%f10		! high word of the next x
	ld	[%i1+4],%f11		! low word of the next x
	ba,pt	%icc,.LOOP1
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
!
! .SKIP2 -- skip the current element in slot 2 and refill the slot.
! Decrements the remaining count in %i0; if it is now <= 0 control goes
! to .ENDLOOP2.  Otherwise the next x is loaded from [%i1]: its high
! word into both %l2 and %f20, its low word into %f21.  The bits in
! %i5 are cleared from %l2 and the main loop is re-entered at .LOOP2
! with x advanced by stridex.
! NOTE(review): why the element is skipped is decided by the caller of
! this label, which is outside this chunk.
!
.SKIP2:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.ENDLOOP2
! delay slot, harmless if branch taken
	add	%i3,%i4,%i3		! y += stridey
	ld	[%i1],%l2		! high word of the next x (integer copy)
	ld	[%i1],%f20		! high word of the next x (fp copy)
	ld	[%i1+4],%f21		! low word of the next x
	andn	%l2,%i5,%l2		! hx &= ~0x80000000
	ba,pt	%icc,.LOOP2
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
!
! .BIG0 -- the element in slot 0 is too large for the inline code.
! hx (%l0) is compared with 0x7ff00000:
!   hx <  0x7ff00000: the annulling branch is taken and its delay slot
!		       copies %l7 into LIM_l6, which makes .ENDLOOP0
!		       call __vlibm_vsin_big later; nothing is stored
!		       for this element here.
!   hx >= 0x7ff00000: the delay slot is annulled and x - x is stored
!		       through %o0.
! Then the remaining count is decremented; if it is <= 0 control goes
! to .ENDLOOP0, otherwise slot 0 is refilled from slot 1 (%l1 -> %l0
! with the bits in %i5 cleared, %f10 pair -> %f0 pair) and the main
! loop is re-entered at .LOOP0 with x advanced by stridex.
!
.BIG0:
	sethi	%hi(0x7ff00000),%o7
	cmp	%l0,%o7
	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
! delay slot, annulled if branch not taken
	mov	%l7,LIM_l6		! set biguns flag or
	fsubd	%f0,%f0,%f0		! y = x - x
	st	%f0,[%o0]
	st	%f1,[%o0+4]
1:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.ENDLOOP0
! delay slot, harmless if branch taken
	andn	%l1,%i5,%l0		! hx &= ~0x80000000
	fmovd	%f10,%f0		! next x moves from slot 1 to slot 0
	ba,pt	%icc,.LOOP0
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
!
! .BIG1 -- the element in slot 1 is too large for the inline code.
! hx (%l1) is compared with 0x7ff00000:
!   hx <  0x7ff00000: the annulling branch is taken and its delay slot
!		       copies %l7 into LIM_l6, which makes .ENDLOOP0
!		       call __vlibm_vsin_big later; nothing is stored
!		       for this element here.
!   hx >= 0x7ff00000: the delay slot is annulled and x - x is stored
!		       through %o1.
! Then the remaining count is decremented; if it is <= 0 control goes
! to .ENDLOOP1, otherwise slot 1 is refilled from slot 2 (%l2 -> %l1
! with the bits in %i5 cleared, %f20 pair -> %f10 pair) and the main
! loop is re-entered at .LOOP1 with x advanced by stridex.
!
.BIG1:
	sethi	%hi(0x7ff00000),%o7
	cmp	%l1,%o7
	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
! delay slot, annulled if branch not taken
	mov	%l7,LIM_l6		! set biguns flag or
	fsubd	%f10,%f10,%f10		! y = x - x
	st	%f10,[%o1]
	st	%f11,[%o1+4]
1:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.ENDLOOP1
! delay slot, harmless if branch taken
	andn	%l2,%i5,%l1		! hx &= ~0x80000000
	fmovd	%f20,%f10		! next x moves from slot 2 to slot 1
	ba,pt	%icc,.LOOP1
! delay slot
	add	%i1,%i2,%i1		! x += stridex


	.align	32
!
! .BIG2 -- the element in slot 2 is too large for the inline code.
! hx (%l2) is compared with 0x7ff00000:
!   hx <  0x7ff00000: the annulling branch is taken and its delay slot
!		       copies %l7 into LIM_l6, which makes .ENDLOOP0
!		       call __vlibm_vsin_big later; nothing is stored
!		       for this element here.
!   hx >= 0x7ff00000: the delay slot is annulled and x - x is stored
!		       through %o2.
! Then the remaining count is decremented; if it is <= 0 control goes
! to .ENDLOOP2.  Otherwise the next x is loaded from [%i1] (high word
! into %l2 and %f20, low word into %f21), the bits in %i5 are cleared
! from %l2, and the main loop is re-entered at .LOOP2 with x advanced
! by stridex.
!
.BIG2:
	sethi	%hi(0x7ff00000),%o7
	cmp	%l2,%o7
	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
! delay slot, annulled if branch not taken
	mov	%l7,LIM_l6		! set biguns flag or
	fsubd	%f20,%f20,%f20		! y = x - x
	st	%f20,[%o2]
	st	%f21,[%o2+4]
1:
	addcc	%i0,-1,%i0
	ble,pn	%icc,.ENDLOOP2
! delay slot
	nop
	ld	[%i1],%l2		! high word of the next x (integer copy)
	ld	[%i1],%f20		! high word of the next x (fp copy)
	ld	[%i1+4],%f21		! low word of the next x
	andn	%l2,%i5,%l2		! hx &= ~0x80000000
	ba,pt	%icc,.LOOP2
! delay slot
	add	%i1,%i2,%i1		! x += stridex

	SET_SIZE(__vsin)

